diff --git a/.clang-tidy b/.clang-tidy index 5466a4a31d20a3..d5fc66c26d42d9 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -2,6 +2,7 @@ # NOTE: there must be no spaces before the '-', so put the comma first. Checks: ' * + ,clang-analyzer-* ,modernize-* ,-cert-err58-cpp ,-cert-err60-cpp @@ -9,6 +10,7 @@ Checks: ' ,-cppcoreguidelines-owning-memory ,-cppcoreguidelines-pro-bounds-array-to-pointer-decay ,-cppcoreguidelines-pro-bounds-constant-array-index + ,-cppcoreguidelines-pro-type-member-init ,-cppcoreguidelines-pro-type-static-cast-downcast ,-cppcoreguidelines-pro-type-vararg ,-cppcoreguidelines-special-member-functions @@ -23,9 +25,11 @@ Checks: ' ,-hicpp-braces-around-statements ,-hicpp-explicit-conversions ,-hicpp-no-array-decay + ,-hicpp-signed-bitwise ,-hicpp-special-member-functions ,-hicpp-vararg ,-llvm-header-guard + ,-llvm-include-order ,-llvm-namespace-comment ,-misc-unused-parameters ,-modernize-make-unique @@ -34,7 +38,6 @@ Checks: ' ,-readability-braces-around-statements ,-readability-else-after-return ,-readability-named-parameter - ,clang-analyzer-* ' WarningsAsErrors: '' HeaderFilterRegex: 'torch/csrc/' diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000000000..cd41d1a02f8290 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.bat text eol=crlf diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index 6b8aa6fc62bb94..3bc5157d9cab7a 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -124,7 +124,7 @@ CMAKE_ARGS+=("-DUSE_OBSERVERS=ON") CMAKE_ARGS+=("-DUSE_ZSTD=ON") CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}") -if [[ $BUILD_ENVIRONMENT == *-aten-* ]]; then +if [[ $BUILD_ENVIRONMENT == *-aten-* || -n "$INTEGRATED" ]]; then if [[ CMAKE_ARGS != *USE_ATEN* ]] && [[ CMAKE_ARGS != *BUILD_ATEN* ]]; then CMAKE_ARGS+=("-DBUILD_ATEN=ON") fi diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 053a9be5e05487..40e3e21417b9b2 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -64,7 +64,13 @@ for test in $(find "${INSTALL_PREFIX}/test" -executable -type f); do ;; */aten/*) # ATen uses test framework Catch2 - "$test" -r=xml -o "${junit_reports_dir}/$(basename $test).xml" + # NB: We do NOT use the xml test reporter, because + # Catch doesn't support multiple reporters + # c.f. https://github.com/catchorg/Catch2/blob/master/docs/release-notes.md#223 + # which means that enabling XML output means you lose useful stdout + # output for Jenkins. It's more important to have useful console + # output than it is to have XML output for Jenkins. 
+ "$test" ;; *) "$test" --gtest_output=xml:"$gtest_reports_dir/$(basename $test).xml" @@ -109,6 +115,10 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then # Our cuda top_k op has some asm code, the hipified version doesn't # compile yet, so we don't have top_k operator for now rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/top_k_test.py") + + # Our AMD CI boxes have 4 gpus on each + # Remove this once we have added multi-gpu support + export HIP_VISIBLE_DEVICES=$(($BUILD_NUMBER % 4)) fi # Python tests diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 56db6914c1c20a..48e81dfd635bce 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -43,12 +43,9 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then # https://github.com/RadeonOpenCompute/hcc#hcc-with-thinlto-linking export KMTHINLTO=1 - sudo chown -R jenkins:jenkins /usr/local - rm -rf "$(dirname "${BASH_SOURCE[0]}")/../../../pytorch_amd/" || true - python "$(dirname "${BASH_SOURCE[0]}")/../../tools/amd_build/build_pytorch_amd.py" - - USE_ROCM=1 python setup.py install - exit + python tools/amd_build/build_pytorch_amd.py + USE_ROCM=1 python setup.py install --user + exit 0 fi # TODO: Don't install this here diff --git a/CMakeLists.txt b/CMakeLists.txt index 651e230ab35ea7..c7eb20d1336550 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,9 +214,10 @@ if(NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-overflow") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-aliasing") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-stringop-overflow") # These flags are not available in GCC-4.8.5. Set only when using clang. # Compared against https://gcc.gnu.org/onlinedocs/gcc-4.8.5/gcc/Option-Summary.html - if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-invalid-partial-specialization") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-typedef-redefinition") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-warning-option") @@ -226,6 +227,7 @@ if(NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-c++14-extensions") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-constexpr-not-const") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-braces") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qunused-arguments") endif() if ((APPLE AND (NOT ("${CLANG_VERSION_STRING}" VERSION_LESS "9.0"))) OR (CMAKE_COMPILER_IS_GNUCXX @@ -284,6 +286,8 @@ include_directories(BEFORE ${PROJECT_SOURCE_DIR}) # in PROJECT_SOURCE_DIR. 
include_directories(BEFORE ${PROJECT_BINARY_DIR}) +include_directories(BEFORE ${PROJECT_SOURCE_DIR}/aten/src/) + # ---[ Old caffe protobuf if(BUILD_CAFFE2) add_subdirectory(caffe/proto) diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index 462a12b086d2d0..2f2ffdce186d39 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -146,4 +146,5 @@ if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) + set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) endif() diff --git a/aten/src/ATen/Allocator.h b/aten/src/ATen/Allocator.h index c1c78102a0fef8..26989a7ea7fbed 100644 --- a/aten/src/ATen/Allocator.h +++ b/aten/src/ATen/Allocator.h @@ -6,7 +6,7 @@ #include #include #include -#include +#include namespace at { diff --git a/aten/src/ATen/ArrayRef.cpp b/aten/src/ATen/ArrayRef.cpp new file mode 100644 index 00000000000000..2a5d1f7a7cb595 --- /dev/null +++ b/aten/src/ATen/ArrayRef.cpp @@ -0,0 +1 @@ +#include diff --git a/aten/src/ATen/ArrayRef.h b/aten/src/ATen/ArrayRef.h index df144025578c6b..f52a5fcf1c2c58 100644 --- a/aten/src/ATen/ArrayRef.h +++ b/aten/src/ATen/ArrayRef.h @@ -1,192 +1,2 @@ -//===--- ArrayRef.h - Array Reference Wrapper -------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -// ATen: modified from llvm::ArrayRef. -// removed llvm-specific functionality -// removed some implicit const -> non-const conversions that rely on -// complicated std::enable_if meta-programming -// removed a bunch of slice variants for simplicity... - #pragma once - -#include -#include - -#include -#include -#include - -namespace at { - /// ArrayRef - Represent a constant reference to an array (0 or more elements - /// consecutively in memory), i.e. a start pointer and a length. It allows - /// various APIs to take consecutive elements easily and conveniently. - /// - /// This class does not own the underlying data, it is expected to be used in - /// situations where the data resides in some other buffer, whose lifetime - /// extends past that of the ArrayRef. For this reason, it is not in general - /// safe to store an ArrayRef. - /// - /// This is intended to be trivially copyable, so it should be passed by - /// value. - template - class ArrayRef { - public: - typedef const T *iterator; - typedef const T *const_iterator; - typedef size_t size_type; - - typedef std::reverse_iterator reverse_iterator; - - private: - /// The start of the array, in an external buffer. - const T *Data; - - /// The number of elements. - size_type Length; - - public: - /// @name Constructors - /// @{ - - /// Construct an empty ArrayRef. - /*implicit*/ ArrayRef() : Data(nullptr), Length(0) {} - - /// Construct an ArrayRef from a single element. - /*implicit*/ ArrayRef(const T &OneElt) - : Data(&OneElt), Length(1) {} - - /// Construct an ArrayRef from a pointer and length. - /*implicit*/ ArrayRef(const T *data, size_t length) - : Data(data), Length(length) {} - - /// Construct an ArrayRef from a range. - ArrayRef(const T *begin, const T *end) - : Data(begin), Length(end - begin) {} - - /// Construct an ArrayRef from a SmallVector. 
This is templated in order to - /// avoid instantiating SmallVectorTemplateCommon whenever we - /// copy-construct an ArrayRef. - template - /*implicit*/ ArrayRef(const SmallVectorTemplateCommon &Vec) - : Data(Vec.data()), Length(Vec.size()) { - } - - /// Construct an ArrayRef from a std::vector. - template - /*implicit*/ ArrayRef(const std::vector &Vec) - : Data(Vec.data()), Length(Vec.size()) {} - - /// Construct an ArrayRef from a std::array - template - /*implicit*/ constexpr ArrayRef(const std::array &Arr) - : Data(Arr.data()), Length(N) {} - - /// Construct an ArrayRef from a C array. - template - /*implicit*/ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {} - - /// Construct an ArrayRef from a std::initializer_list. - /*implicit*/ ArrayRef(const std::initializer_list &Vec) - : Data(Vec.begin() == Vec.end() ? (T*)nullptr : Vec.begin()), - Length(Vec.size()) {} - - /// @} - /// @name Simple Operations - /// @{ - - const_iterator begin() const { return Data; } - const_iterator end() const { return Data + Length; } - - reverse_iterator rbegin() const { return reverse_iterator(end()); } - reverse_iterator rend() const { return reverse_iterator(begin()); } - - /// empty - Check if the array is empty. - bool empty() const { return Length == 0; } - - const T *data() const { return Data; } - - /// size - Get the array size. - size_t size() const { return Length; } - - /// front - Get the first element. - const T &front() const { - AT_CHECK(!empty(), "ArrayRef: attempted to access front() of empty list"); - return Data[0]; - } - - /// back - Get the last element. - const T &back() const { - AT_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); - return Data[Length-1]; - } - - /// equals - Check for element-wise equality. - bool equals(ArrayRef RHS) const { - if (Length != RHS.Length) - return false; - return std::equal(begin(), end(), RHS.begin()); - } - - /// slice(n, m) - Chop off the first N elements of the array, and keep M - /// elements in the array. - ArrayRef slice(size_t N, size_t M) const { - AT_CHECK(N+M <= size(), "ArrayRef: invalid slice, ", N, " + ", M, " is not <= ", size()); - return ArrayRef(data()+N, M); - } - - /// slice(n) - Chop off the first N elements of the array. - ArrayRef slice(size_t N) const { return slice(N, size() - N); } - - /// @} - /// @name Operator Overloads - /// @{ - const T &operator[](size_t Index) const { - return Data[Index]; - } - - /// Vector compatibility - const T &at(size_t Index) const { - AT_CHECK(Index < Length, "ArrayRef: invalid index ", Index, " for length ", Length); - return Data[Index]; - } - - /// Disallow accidental assignment from a temporary. - /// - /// The declaration here is extra complicated so that "arrayRef = {}" - /// continues to select the move assignment operator. - template - typename std::enable_if::value, ArrayRef>::type & - operator=(U &&Temporary) = delete; - - /// Disallow accidental assignment from a temporary. - /// - /// The declaration here is extra complicated so that "arrayRef = {}" - /// continues to select the move assignment operator. 
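(Editorial aside, not part of the patch.) The deleted header above documents that ArrayRef is a non-owning view which must not outlive its backing buffer, and that slicing and at() are bounds-checked with AT_CHECK. A minimal C++ sketch of that contract, assuming <ATen/ArrayRef.h> keeps working as a forwarding header after this move (the include target of the new one-line file is elided in this diff):

#include <ATen/ArrayRef.h>
#include <cassert>
#include <vector>

void arrayref_demo() {
  std::vector<int> storage = {1, 2, 3, 4, 5};
  at::ArrayRef<int> view(storage);        // non-owning view over the vector
  at::ArrayRef<int> tail = view.slice(2); // {3, 4, 5}; bounds-checked via AT_CHECK
  assert(tail.size() == 3 && tail.front() == 3);
  // Unsafe: a view over a temporary vector dangles as soon as the temporary
  // dies, which is why storing an ArrayRef is discouraged:
  //   at::ArrayRef<int> dangling(std::vector<int>{7, 8, 9});
  std::vector<int> copy = view.vec();     // materialize an owning copy instead
  (void)copy;
}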
- template - typename std::enable_if::value, ArrayRef>::type & - operator=(std::initializer_list) = delete; - - /// @} - /// @name Expensive Operations - /// @{ - std::vector vec() const { - return std::vector(Data, Data+Length); - } - - /// @} - /// @name Conversion operators - /// @{ - operator std::vector() const { - return std::vector(Data, Data+Length); - } - - /// @} - }; - -} // end namespace at +#include diff --git a/aten/src/ATen/Backtrace.h b/aten/src/ATen/Backtrace.h index 347c430d61b75c..bdef9f4a9de439 100644 --- a/aten/src/ATen/Backtrace.h +++ b/aten/src/ATen/Backtrace.h @@ -1,28 +1,2 @@ #pragma once - -#include -#include -#include - -#include - -namespace at { -/// Utility to demangle a C++ symbol name. -AT_API std::string demangle(const char* name); - -/// Returns the printable name of the type. -template -inline const char* demangle_type() { -#ifdef __GXX_RTTI - static const std::string name = demangle(typeid(T).name()); - return name.c_str(); -#else // __GXX_RTTI - return "(RTTI disabled, cannot show name)"; -#endif // __GXX_RTTI -} - -AT_API std::string get_backtrace( - size_t frames_to_skip = 0, - size_t maximum_number_of_frames = 64, - bool skip_python_frames = true); -} // namespace at +#include diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 562910ad86a298..25a2e6d8b501f0 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -44,6 +44,7 @@ CONFIGURE_FILE(cuda/CUDAConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/cuda/CUDAConfig # NB: If you edit these globs, you'll have to update setup.py package_data as well FILE(GLOB base_h "*.h" "detail/*.h") FILE(GLOB base_cpp "*.cpp" "detail/*.cpp") +add_subdirectory(core) FILE(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh") FILE(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp") FILE(GLOB cuda_cu "cuda/*.cu" "cuda/detail/*.cu") @@ -62,7 +63,7 @@ FILE(GLOB native_cuda_cpp "native/cuda/*.cpp") FILE(GLOB native_mkl_cpp "native/mkl/*.cpp") FILE(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp") -set(all_cpu_cpp ${base_cpp} ${native_cpp} ${native_sparse_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp}) +set(all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp} ${native_sparse_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp}) if(AT_MKL_ENABLED) set(all_cpu_cpp ${all_cpu_cpp} ${mkl_cpp}) endif() @@ -393,7 +394,7 @@ INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" DESTINATION "${AT_INSTALL_SHARE_DIR}/cmake/ATen") # https://stackoverflow.com/questions/11096471/how-can-i-install-a-hierarchy-of-files-using-cmake -FOREACH(HEADER ${base_h} ${cuda_h} ${cudnn_h}) +FOREACH(HEADER ${base_h} ${ATen_CORE_HEADERS} ${cuda_h} ${cudnn_h}) string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" "" HEADER_SUB ${HEADER}) GET_FILENAME_COMPONENT(DIR ${HEADER_SUB} DIRECTORY) INSTALL(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/${DIR}) @@ -444,6 +445,7 @@ if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) endif() # Pass source, includes, and libs to parent +set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE) set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index 2db2786b1c66cd..ef370ea6e0bc30 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -109,8 +109,8 @@ 
struct strided_tensor_iter { : data_(tensor.data()), dim_(tensor.ndimension()), counter_(dim_, 0), - sizes_(tensor.sizes()), - strides_(tensor.strides()) { + sizes_(tensor.sizes().vec()), + strides_(tensor.strides().vec()) { _setup_arrays(tensor, this); } }; diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 59f6ff755ee3f1..d153e6bc6ada00 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -37,8 +37,11 @@ Context::Context() Type::registerCPU(this); } +// NB: Ensure that globalContext is initialized before we load +// variable hooks, otherwise we will deadlock. Regardless, the +// deadlock is bad, and being tracked at https://github.com/pytorch/pytorch/issues/9784 +static Context globalContext_; Context & globalContext() { - static Context globalContext_; return globalContext_; } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 309c4be2e651dd..7d3fdd1cc2d4af 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -9,6 +9,9 @@ #include "ATen/detail/CUDAHooksInterface.h" #include "ATen/CUDAStream.h" +// This is temporary +#include "ATen/core/ATenCoreTest.h" + #include #include #include diff --git a/aten/src/ATen/Error.h b/aten/src/ATen/Error.h index 5a41eb7c74e7cb..2a184d4ecbd5ea 100644 --- a/aten/src/ATen/Error.h +++ b/aten/src/ATen/Error.h @@ -1,131 +1,2 @@ #pragma once - -#include // for AT_API -#include - -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) && _MSC_VER <= 1900 -#define __func__ __FUNCTION__ -#endif - -namespace at { - -namespace detail { - -inline std::ostream& _str(std::ostream& ss) { return ss; } - -template -inline std::ostream& _str(std::ostream& ss, const T& t) { - ss << t; - return ss; -} - -template -inline std::ostream& -_str(std::ostream& ss, const T& t, const Args&... args) { - return _str(_str(ss, t), args...); -} - -} // namespace detail - -// Convert a list of string-like arguments into a single string. -template -inline std::string str(const Args&... args) { - std::ostringstream ss; - detail::_str(ss, args...); - return ss.str(); -} - -// Specializations for already-a-string types. -template <> -inline std::string str(const std::string& str) { - return str; -} -inline std::string str(const char* c_str) { - return c_str; -} - -/// Represents a location in source code (for debugging). -struct SourceLocation { - const char* function; - const char* file; - uint32_t line; -}; - -std::ostream& operator<<(std::ostream& out, const SourceLocation& loc); - -/// The primary ATen error class. -/// Provides a complete error message with source location information via -/// `what()`, and a more concise message via `what_without_backtrace()`. Should -/// primarily be used with the `AT_ERROR` macro. -/// -/// NB: at::Error is handled specially by the default torch to suppress the -/// backtrace, see torch/csrc/Exceptions.h -class AT_API Error : public std::exception { - std::string what_without_backtrace_; - std::string what_; - -public: - Error(SourceLocation source_location, std::string err); - - /// Returns the complete error message, including the source location. - const char* what() const noexcept override { - return what_.c_str(); - } - - /// Returns only the error message string, without source location. 
- const char* what_without_backtrace() const noexcept { - return what_without_backtrace_.c_str(); - } -}; - -class AT_API Warning { - using handler_t = void(*)(const SourceLocation& source_location, const char* msg); - -public: - /// Issue a warning with a given message. Dispatched to the current - /// warning handler. - static void warn(SourceLocation source_location, std::string msg); - - /// Sets the global warning handler. This is not thread-safe, so it should - /// generally be called once during initialization. - static void set_warning_handler(handler_t handler); - - /// The default warning handler. Prints the message to stderr. - static void print_warning(const SourceLocation& source_location, const char* msg); - -private: - static handler_t warning_handler_; -}; - - -} // namespace at - -// TODO: variants that print the expression tested and thus don't require strings -// TODO: CAFFE_ENFORCE_WITH_CALLER style macro - -#define AT_ERROR(...) \ - throw at::Error({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) - -#define AT_WARN(...) \ - at::Warning::warn({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) - -#define AT_ASSERT(cond) \ - if (!(cond)) { \ - AT_ERROR(#cond " ASSERT FAILED at ", __FILE__, ":", __LINE__, ", please report a bug to PyTorch."); \ - } - -#define AT_ASSERTM(cond, ...) \ - if (!(cond)) { \ - AT_ERROR(at::str(#cond, " ASSERT FAILED at ", __FILE__, ":", __LINE__, ", please report a bug to PyTorch. ", __VA_ARGS__)); \ - } - -#define AT_CHECK(cond, ...) \ - if (!(cond)) { \ - AT_ERROR(at::str(__VA_ARGS__)); \ - } +#include diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 35125cfa6751bb..934be4093b7257 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -111,7 +111,7 @@ inline std::vector expand_outplace(TensorList to_expand) { if (!to_expand[i].defined()) { continue; } else if (first) { - sizes = to_expand[i].sizes(); + sizes = to_expand[i].sizes().vec(); first = false; } else { sizes = infer_size(sizes, to_expand[i].sizes()); diff --git a/aten/src/ATen/Half-inl.h b/aten/src/ATen/Half-inl.h deleted file mode 100644 index e5563faca3ab33..00000000000000 --- a/aten/src/ATen/Half-inl.h +++ /dev/null @@ -1,168 +0,0 @@ -#pragma once - -#include "ATen/ATenGeneral.h" -#include -#include - -#ifdef __CUDACC__ -#include -#endif - -namespace at { - -/// Constructors - -inline AT_HOSTDEVICE Half::Half(float value) { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - x = __half_as_short(__float2half(value)); -#else - x = detail::float2halfbits(value); -#endif -} - -/// Implicit conversions - -inline AT_HOSTDEVICE Half::operator float() const { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - return __half2float(*reinterpret_cast(&x)); -#else - return detail::halfbits2float(x); -#endif -} - -#ifdef __CUDACC__ -inline AT_HOSTDEVICE Half::Half(const __half& value) { - x = *reinterpret_cast(&value); -} -inline AT_HOSTDEVICE Half::operator __half() const { - return *reinterpret_cast(&x); -} -#endif - -/// Arithmetic - -inline AT_HOSTDEVICE Half operator+(const Half& a, const Half& b) { - return (float)a + (float)b; -} - -inline AT_HOSTDEVICE Half operator-(const Half& a, const Half& b) { - return (float)a - (float)b; -} - -inline AT_HOSTDEVICE Half operator*(const Half& a, const Half& b) { - return (float)a * (float)b; -} - -inline AT_HOSTDEVICE Half operator/(const Half& a, const Half& b) { - return (float)a / (float)b; -} - -inline AT_HOSTDEVICE Half operator-(const Half& a) { - return 
-(float)a; -} - -inline AT_HOSTDEVICE Half& operator+=(Half& a, const Half& b) { - a = a + b; - return a; -} - -inline AT_HOSTDEVICE Half& operator-=(Half& a, const Half& b) { - a = a - b; - return a; -} - -inline AT_HOSTDEVICE Half& operator*=(Half& a, const Half& b) { - a = a * b; - return a; -} - -inline AT_HOSTDEVICE Half& operator/=(Half& a, const Half& b) { - a = a / b; - return a; -} - -/// Arithmetic with floats - -inline AT_HOSTDEVICE float operator+(Half a, float b) { return (float)a + b; } -inline AT_HOSTDEVICE float operator-(Half a, float b) { return (float)a - b; } -inline AT_HOSTDEVICE float operator*(Half a, float b) { return (float)a * b; } -inline AT_HOSTDEVICE float operator/(Half a, float b) { return (float)a / b; } - -inline AT_HOSTDEVICE float operator+(float a, Half b) { return a + (float)b; } -inline AT_HOSTDEVICE float operator-(float a, Half b) { return a - (float)b; } -inline AT_HOSTDEVICE float operator*(float a, Half b) { return a * (float)b; } -inline AT_HOSTDEVICE float operator/(float a, Half b) { return a / (float)b; } - -inline AT_HOSTDEVICE float& operator+=(float& a, const Half& b) { return a += (float)b; } -inline AT_HOSTDEVICE float& operator-=(float& a, const Half& b) { return a -= (float)b; } -inline AT_HOSTDEVICE float& operator*=(float& a, const Half& b) { return a *= (float)b; } -inline AT_HOSTDEVICE float& operator/=(float& a, const Half& b) { return a /= (float)b; } - -/// Arithmetic with doubles - -inline AT_HOSTDEVICE double operator+(Half a, double b) { return (double)a + b; } -inline AT_HOSTDEVICE double operator-(Half a, double b) { return (double)a - b; } -inline AT_HOSTDEVICE double operator*(Half a, double b) { return (double)a * b; } -inline AT_HOSTDEVICE double operator/(Half a, double b) { return (double)a / b; } - -inline AT_HOSTDEVICE double operator+(double a, Half b) { return a + (double)b; } -inline AT_HOSTDEVICE double operator-(double a, Half b) { return a - (double)b; } -inline AT_HOSTDEVICE double operator*(double a, Half b) { return a * (double)b; } -inline AT_HOSTDEVICE double operator/(double a, Half b) { return a / (double)b; } - -/// Arithmetic with ints - -inline AT_HOSTDEVICE Half operator+(Half a, int b) { return a + (Half)b; } -inline AT_HOSTDEVICE Half operator-(Half a, int b) { return a - (Half)b; } -inline AT_HOSTDEVICE Half operator*(Half a, int b) { return a * (Half)b; } -inline AT_HOSTDEVICE Half operator/(Half a, int b) { return a / (Half)b; } - -inline AT_HOSTDEVICE Half operator+(int a, Half b) { return (Half)a + b; } -inline AT_HOSTDEVICE Half operator-(int a, Half b) { return (Half)a - b; } -inline AT_HOSTDEVICE Half operator*(int a, Half b) { return (Half)a * b; } -inline AT_HOSTDEVICE Half operator/(int a, Half b) { return (Half)a / b; } - -/// NOTE: we do not define comparisons directly and instead rely on the implicit -/// conversion from at::Half to float. 
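(Editorial aside, not part of the patch.) The note above says arithmetic on at::Half is carried out in float32 and comparisons rely on the implicit conversion to float. A minimal host-only sketch of that behaviour, assuming the relocated <ATen/Half.h> keeps the operators shown here:

#include <ATen/Half.h>
#include <iostream>

int main() {
  at::Half a = 1.5f;                // float -> Half via float2halfbits
  at::Half b = 2.25f;
  at::Half sum = a + b;             // computed as (float)a + (float)b, then rounded back to half
  bool less = a < b;                // no Half comparison operator; both sides convert to float
  std::cout << static_cast<float>(sum) << " " << less << std::endl;
  return 0;
}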
- -} // namespace at - -namespace std { - -template<> class numeric_limits { - public: - static constexpr bool is_specialized = true; - static constexpr bool is_signed = true; - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr bool has_infinity = true; - static constexpr bool has_quiet_NaN = true; - static constexpr bool has_signaling_NaN = true; - static constexpr auto has_denorm = numeric_limits::has_denorm; - static constexpr auto has_denorm_loss = numeric_limits::has_denorm_loss; - static constexpr auto round_style = numeric_limits::round_style; - static constexpr bool is_iec559 = true; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = false; - static constexpr int digits = 11; - static constexpr int digits10 = 3; - static constexpr int max_digits10 = 5; - static constexpr int radix = 2; - static constexpr int min_exponent = -13; - static constexpr int min_exponent10 = -4; - static constexpr int max_exponent = 16; - static constexpr int max_exponent10 = 4; - static constexpr auto traps = numeric_limits::traps; - static constexpr auto tinyness_before = numeric_limits::tinyness_before; - static constexpr at::Half min() { return at::Half(0x0400, at::Half::from_bits); } - static constexpr at::Half lowest() { return at::Half(0xFBFF, at::Half::from_bits); } - static constexpr at::Half max() { return at::Half(0x7BFF, at::Half::from_bits); } - static constexpr at::Half epsilon() { return at::Half(0x1400, at::Half::from_bits); } - static constexpr at::Half round_error() { return at::Half(0x3800, at::Half::from_bits); } - static constexpr at::Half infinity() { return at::Half(0x7C00, at::Half::from_bits); } - static constexpr at::Half quiet_NaN() { return at::Half(0x7E00, at::Half::from_bits); } - static constexpr at::Half signaling_NaN() { return at::Half(0x7D00, at::Half::from_bits); } - static constexpr at::Half denorm_min() { return at::Half(0x0001, at::Half::from_bits); } -}; - -} // namespace std diff --git a/aten/src/ATen/Half.cpp b/aten/src/ATen/Half.cpp deleted file mode 100644 index 68f80a56ea8195..00000000000000 --- a/aten/src/ATen/Half.cpp +++ /dev/null @@ -1,34 +0,0 @@ -#include "ATen/Half.h" - -#include "ATen/Tensor.h" -#include "ATen/Context.h" - -#include -#include - -namespace at { - -static_assert(std::is_standard_layout::value, "at::Half must be standard layout."); - -namespace detail { - -float halfbits2float(unsigned short bits) { - float value; - TH_halfbits2float(&bits, &value); - return value; -} - -unsigned short float2halfbits(float value) { - unsigned short bits; - TH_float2halfbits(&value, &bits); - return bits; -} - -} // namespace detail - -std::ostream& operator<<(std::ostream & out, const Half& value) { - out << (float)value; - return out; -} - -} // namespace at diff --git a/aten/src/ATen/Half.h b/aten/src/ATen/Half.h index b7ac47e4fda79a..21941116f19e82 100644 --- a/aten/src/ATen/Half.h +++ b/aten/src/ATen/Half.h @@ -1,120 +1,2 @@ #pragma once - -/// Defines the Half type (half-precision floating-point) including conversions -/// to standard C types and basic arithmetic operations. Note that arithmetic -/// operations are implemented by converting to floating point and -/// performing the operation in float32, instead of using CUDA half intrinisics. -/// Most uses of this type within ATen are memory bound, including the -/// element-wise kernels, and the half intrinisics aren't efficient on all GPUs. 
-/// If you are writing a compute bound kernel, you can use the CUDA half -/// intrinsics directly on the Half type from device code. - -#include "ATen/ATenGeneral.h" - -#include -#include -#include -#include -#include -#include -#include - -#ifdef __CUDACC__ -#include -#endif - -#ifndef AT_HOSTDEVICE - #ifdef __CUDACC__ - #define AT_HOSTDEVICE __host__ __device__ - #else - #define AT_HOSTDEVICE - #endif -#endif - -namespace at { - -namespace detail { - -AT_API float halfbits2float(unsigned short bits); -AT_API unsigned short float2halfbits(float value); - -} - -struct alignas(2) Half { - unsigned short x; - - struct from_bits_t {}; - static constexpr from_bits_t from_bits = from_bits_t(); - - // HIP wants __host__ __device__ tag, CUDA does not -#ifdef __HIP_PLATFORM_HCC__ - AT_HOSTDEVICE Half() = default; -#else - Half() = default; -#endif - - constexpr AT_HOSTDEVICE Half(unsigned short bits, from_bits_t) : x(bits) {}; - inline AT_HOSTDEVICE Half(float value); - inline AT_HOSTDEVICE operator float() const; - -#ifdef __CUDACC__ - inline AT_HOSTDEVICE Half(const __half& value); - inline AT_HOSTDEVICE operator __half() const; -#endif -}; - -template To convert(From f) { - return static_cast(f); -} - -// skip isnan and isinf check for integral types -template -typename std::enable_if::value, bool>::type overflows(From f) { - using limit = std::numeric_limits; - if (!limit::is_signed && std::numeric_limits::is_signed) { - // allow for negative numbers to wrap using two's complement arithmetic. - // For example, with uint8, this allows for `a - b` to be treated as - // `a + 255 * b`. - return f > limit::max() || (f < 0 && -(uint64_t)f > limit::max()); - } else { - return f < limit::lowest() || f > limit::max(); - } -} - -template -typename std::enable_if::value, bool>::type overflows(From f) { - using limit = std::numeric_limits; - if (limit::has_infinity && std::isinf((double)f)) { - return false; - } - if (!limit::has_quiet_NaN && (f != f)) { - return true; - } - return f < limit::lowest() || f > limit::max(); -} - -template To checked_convert(From f, const char* name) { - if (overflows(f)) { - std::string msg = "value cannot be converted to type "; - msg += name; - msg += " without overflow: "; - msg += std::to_string(f); - throw std::domain_error(std::move(msg)); - } - return convert(f); -} - -template -To HalfFix(From h) { - To ret; - ret.x = h.x; - return ret; -} - -AT_API std::ostream& operator<<(std::ostream & out, const Half& value); - -} // namespace at - -#include "Half-inl.h" - -#undef AT_HOSTDEVICE +#include diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h index 794d8e5f8c31a9..6aadd62eb1d3fd 100644 --- a/aten/src/ATen/Parallel.h +++ b/aten/src/ATen/Parallel.h @@ -37,7 +37,9 @@ inline void parallel_for( f(begin_tid, std::min(end, chunk_size + begin_tid)); } #else - f(begin, end); + if (begin < end) { + f(begin, end); + } #endif } diff --git a/aten/src/ATen/Scalar.h b/aten/src/ATen/Scalar.h index e80d467b138ac3..f0b84d67554c02 100644 --- a/aten/src/ATen/Scalar.h +++ b/aten/src/ATen/Scalar.h @@ -10,7 +10,6 @@ #include "ATen/Half.h" #include "ATen/ScalarType.h" #include "ATen/TensorBase.h" -#include "ATen/Utils.h" namespace at { diff --git a/aten/src/ATen/ScalarType.h b/aten/src/ATen/ScalarType.h index f7c9243a89df2a..3651aef60e3e1e 100644 --- a/aten/src/ATen/ScalarType.h +++ b/aten/src/ATen/ScalarType.h @@ -10,16 +10,16 @@ namespace at { // NB: Order matters for this macro; it is relied upon in -// _promoteTypesLookup and probably other places. 
+// _promoteTypesLookup and the serialization format. #define AT_FORALL_SCALAR_TYPES(_) \ -_(uint8_t,Byte,i) \ -_(int8_t,Char,i) \ -_(int16_t,Short,i) \ -_(int,Int,i) \ -_(int64_t,Long,i) \ -_(at::Half,Half,d) \ -_(float,Float,d) \ -_(double,Double,d) +_(uint8_t,Byte,i) /* 0 */ \ +_(int8_t,Char,i) /* 1 */ \ +_(int16_t,Short,i) /* 2 */ \ +_(int,Int,i) /* 3 */ \ +_(int64_t,Long,i) /* 4 */ \ +_(at::Half,Half,d) /* 5 */ \ +_(float,Float,d) /* 6 */ \ +_(double,Double,d) /* 7 */ #define AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(_) \ _(uint8_t,Byte,i) \ @@ -35,7 +35,7 @@ enum class ScalarType { n, AT_FORALL_SCALAR_TYPES(DEFINE_ENUM) #undef DEFINE_ENUM - Undefined, + Undefined, // 8 NumOptions }; diff --git a/aten/src/ATen/SmallVector.h b/aten/src/ATen/SmallVector.h index 7c52ef686aa41a..1dbaa933c555dd 100644 --- a/aten/src/ATen/SmallVector.h +++ b/aten/src/ATen/SmallVector.h @@ -1,982 +1,2 @@ -//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the SmallVector class. -// -//===----------------------------------------------------------------------===// - -// ATen: modified from llvm::SmallVector. -// replaced report_bad_alloc_error with std::bad_alloc -// replaced isPodLike with AT_IS_TRIVIALLY_COPYABLE -// replaced iterator_range constructor with inline Container&& constructor -// removed LLVM_NODISCARD and LLVM_ATTRIBUTE_ALWAYS_INLINE qualifiers -// removed LLVM_UNLIKELY - #pragma once - -#include "AlignOf.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#if __GNUG__ && __GNUC__ < 5 -#define AT_IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T) -#else -#define AT_IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable::value -#endif - -namespace at { - -namespace detail { - -// From llvm/Support/MathExtras.h -static inline uint64_t NextPowerOf2(uint64_t A) { - A |= (A >> 1); - A |= (A >> 2); - A |= (A >> 4); - A |= (A >> 8); - A |= (A >> 16); - A |= (A >> 32); - return A + 1; -} - -} - -/// This is all the non-templated stuff common to all SmallVectors. -class AT_API SmallVectorBase { -protected: - void *BeginX, *EndX, *CapacityX; - -protected: - SmallVectorBase(void *FirstEl, size_t Size) - : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl+Size) {} - - /// This is an implementation of the grow() method which only works - /// on POD-like data types and is out of line to reduce code duplication. - void grow_pod(void *FirstEl, size_t MinSizeInBytes, size_t TSize); - -public: - /// This returns size()*sizeof(T). - size_t size_in_bytes() const { - return size_t((char*)EndX - (char*)BeginX); - } - - /// capacity_in_bytes - This returns capacity()*sizeof(T). - size_t capacity_in_bytes() const { - return size_t((char*)CapacityX - (char*)BeginX); - } - - bool empty() const { return BeginX == EndX; } -}; - -/// This is the part of SmallVectorTemplateBase which does not depend on whether -/// the type T is a POD. The extra dummy template argument is used by ArrayRef -/// to avoid unnecessarily requiring T to be complete. -template -class SmallVectorTemplateCommon : public SmallVectorBase { -private: - template friend struct SmallVectorStorage; - - // Allocate raw space for N elements of type T. 
If T has a ctor or dtor, we - // don't want it to be automatically run, so we need to represent the space as - // something else. Use an array of char of sufficient alignment. - using U = AlignedCharArrayUnion; - U FirstEl; - // Space after 'FirstEl' is clobbered, do not add any instance vars after it. - -protected: - SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(&FirstEl, Size) {} - - void grow_pod(size_t MinSizeInBytes, size_t TSize) { - SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize); - } - - /// Return true if this is a smallvector which has not had dynamic - /// memory allocated for it. - bool isSmall() const { - return BeginX == static_cast(&FirstEl); - } - - /// Put this vector in a state of being small. - void resetToSmall() { - BeginX = EndX = CapacityX = &FirstEl; - } - - void setEnd(T *P) { this->EndX = P; } - -public: - using size_type = size_t; - using difference_type = ptrdiff_t; - using value_type = T; - using iterator = T *; - using const_iterator = const T *; - - using const_reverse_iterator = std::reverse_iterator; - using reverse_iterator = std::reverse_iterator; - - using reference = T &; - using const_reference = const T &; - using pointer = T *; - using const_pointer = const T *; - - // forward iterator creation methods. - iterator begin() { return (iterator)this->BeginX; } - const_iterator begin() const { return (const_iterator)this->BeginX; } - iterator end() { return (iterator)this->EndX; } - const_iterator end() const { return (const_iterator)this->EndX; } - -protected: - iterator capacity_ptr() { return (iterator)this->CapacityX; } - const_iterator capacity_ptr() const { return (const_iterator)this->CapacityX;} - -public: - // reverse iterator creation methods. - reverse_iterator rbegin() { return reverse_iterator(end()); } - const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } - reverse_iterator rend() { return reverse_iterator(begin()); } - const_reverse_iterator rend() const { return const_reverse_iterator(begin());} - - size_type size() const { return end()-begin(); } - size_type max_size() const { return size_type(-1) / sizeof(T); } - - /// Return the total number of elements in the currently allocated buffer. - size_t capacity() const { return capacity_ptr() - begin(); } - - /// Return a pointer to the vector's buffer, even if empty(). - pointer data() { return pointer(begin()); } - /// Return a pointer to the vector's buffer, even if empty(). - const_pointer data() const { return const_pointer(begin()); } - - reference operator[](size_type idx) { - assert(idx < size()); - return begin()[idx]; - } - const_reference operator[](size_type idx) const { - assert(idx < size()); - return begin()[idx]; - } - - reference front() { - assert(!empty()); - return begin()[0]; - } - const_reference front() const { - assert(!empty()); - return begin()[0]; - } - - reference back() { - assert(!empty()); - return end()[-1]; - } - const_reference back() const { - assert(!empty()); - return end()[-1]; - } -}; - -/// SmallVectorTemplateBase - This is where we put method -/// implementations that are designed to work with non-POD-like T's. -template -class SmallVectorTemplateBase : public SmallVectorTemplateCommon { -protected: - SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} - - static void destroy_range(T *S, T *E) { - while (S != E) { - --E; - E->~T(); - } - } - - /// Move the range [I, E) into the uninitialized memory starting with "Dest", - /// constructing elements as needed. 
- template - static void uninitialized_move(It1 I, It1 E, It2 Dest) { - std::uninitialized_copy(std::make_move_iterator(I), - std::make_move_iterator(E), Dest); - } - - /// Copy the range [I, E) onto the uninitialized memory starting with "Dest", - /// constructing elements as needed. - template - static void uninitialized_copy(It1 I, It1 E, It2 Dest) { - std::uninitialized_copy(I, E, Dest); - } - - /// Grow the allocated memory (without initializing new elements), doubling - /// the size of the allocated memory. Guarantees space for at least one more - /// element, or MinSize more elements if specified. - void grow(size_t MinSize = 0); - -public: - void push_back(const T &Elt) { - if (this->EndX >= this->CapacityX) - this->grow(); - ::new ((void*) this->end()) T(Elt); - this->setEnd(this->end()+1); - } - - void push_back(T &&Elt) { - if (this->EndX >= this->CapacityX) - this->grow(); - ::new ((void*) this->end()) T(::std::move(Elt)); - this->setEnd(this->end()+1); - } - - void pop_back() { - this->setEnd(this->end()-1); - this->end()->~T(); - } -}; - -// Define this out-of-line to dissuade the C++ compiler from inlining it. -template -void SmallVectorTemplateBase::grow(size_t MinSize) { - size_t CurCapacity = this->capacity(); - size_t CurSize = this->size(); - // Always grow, even from zero. - size_t NewCapacity = size_t(detail::NextPowerOf2(CurCapacity+2)); - if (NewCapacity < MinSize) - NewCapacity = MinSize; - T *NewElts = static_cast(malloc(NewCapacity*sizeof(T))); - if (NewElts == nullptr) - throw std::bad_alloc(); - - // Move the elements over. - this->uninitialized_move(this->begin(), this->end(), NewElts); - - // Destroy the original elements. - destroy_range(this->begin(), this->end()); - - // If this wasn't grown from the inline copy, deallocate the old space. - if (!this->isSmall()) - free(this->begin()); - - this->setEnd(NewElts+CurSize); - this->BeginX = NewElts; - this->CapacityX = this->begin()+NewCapacity; -} - - -/// SmallVectorTemplateBase - This is where we put method -/// implementations that are designed to work with POD-like T's. -template -class SmallVectorTemplateBase : public SmallVectorTemplateCommon { -protected: - SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} - - // No need to do a destroy loop for POD's. - static void destroy_range(T *, T *) {} - - /// Move the range [I, E) onto the uninitialized memory - /// starting with "Dest", constructing elements into it as needed. - template - static void uninitialized_move(It1 I, It1 E, It2 Dest) { - // Just do a copy. - uninitialized_copy(I, E, Dest); - } - - /// Copy the range [I, E) onto the uninitialized memory - /// starting with "Dest", constructing elements into it as needed. - template - static void uninitialized_copy(It1 I, It1 E, It2 Dest) { - // Arbitrary iterator types; just use the basic implementation. - std::uninitialized_copy(I, E, Dest); - } - - /// Copy the range [I, E) onto the uninitialized memory - /// starting with "Dest", constructing elements into it as needed. - template - static void uninitialized_copy( - T1 *I, T1 *E, T2 *Dest, - typename std::enable_if::type, - T2>::value>::type * = nullptr) { - // Use memcpy for PODs iterated by pointers (which includes SmallVector - // iterators): std::uninitialized_copy optimizes to memmove, but we can - // use memcpy here. Note that I and E are iterators and thus might be - // invalid for memcpy if they are equal. 
- if (I != E) - memcpy(Dest, I, (E - I) * sizeof(T)); - } - - /// Double the size of the allocated memory, guaranteeing space for at - /// least one more element or MinSize if specified. - void grow(size_t MinSize = 0) { - this->grow_pod(MinSize*sizeof(T), sizeof(T)); - } - -public: - void push_back(const T &Elt) { - if (this->EndX >= this->CapacityX) - this->grow(); - memcpy(this->end(), &Elt, sizeof(T)); - this->setEnd(this->end()+1); - } - - void pop_back() { - this->setEnd(this->end()-1); - } -}; - -/// This class consists of common code factored out of the SmallVector class to -/// reduce code duplication based on the SmallVector 'N' template parameter. -template -class SmallVectorImpl : public SmallVectorTemplateBase { - using SuperClass = SmallVectorTemplateBase; - -public: - using iterator = typename SuperClass::iterator; - using const_iterator = typename SuperClass::const_iterator; - using size_type = typename SuperClass::size_type; - -protected: - // Default ctor - Initialize to empty. - explicit SmallVectorImpl(unsigned N) - : SmallVectorTemplateBase(N*sizeof(T)) { - } - -public: - SmallVectorImpl(const SmallVectorImpl &) = delete; - - ~SmallVectorImpl() { - // Destroy the constructed elements in the vector. - this->destroy_range(this->begin(), this->end()); - - // If this wasn't grown from the inline copy, deallocate the old space. - if (!this->isSmall()) - free(this->begin()); - } - - void clear() { - this->destroy_range(this->begin(), this->end()); - this->EndX = this->BeginX; - } - - void resize(size_type N) { - if (N < this->size()) { - this->destroy_range(this->begin()+N, this->end()); - this->setEnd(this->begin()+N); - } else if (N > this->size()) { - if (this->capacity() < N) - this->grow(N); - auto I = this->end(); - for (auto E = this->begin() + N; I != E; ++I) - new (&*I) T(); - this->setEnd(this->begin()+N); - } - } - - void resize(size_type N, const T &NV) { - if (N < this->size()) { - this->destroy_range(this->begin()+N, this->end()); - this->setEnd(this->begin()+N); - } else if (N > this->size()) { - if (this->capacity() < N) - this->grow(N); - std::uninitialized_fill(this->end(), this->begin()+N, NV); - this->setEnd(this->begin()+N); - } - } - - void reserve(size_type N) { - if (this->capacity() < N) - this->grow(N); - } - - T pop_back_val() { - T Result = ::std::move(this->back()); - this->pop_back(); - return Result; - } - - void swap(SmallVectorImpl &RHS); - - /// Add the specified range to the end of the SmallVector. - template ::iterator_category, - std::input_iterator_tag>::value>::type> - void append(in_iter in_start, in_iter in_end) { - size_type NumInputs = std::distance(in_start, in_end); - // Grow allocated space if needed. - if (NumInputs > size_type(this->capacity_ptr()-this->end())) - this->grow(this->size()+NumInputs); - - // Copy the new elements over. - this->uninitialized_copy(in_start, in_end, this->end()); - this->setEnd(this->end() + NumInputs); - } - - /// Add the specified range to the end of the SmallVector. - void append(size_type NumInputs, const T &Elt) { - // Grow allocated space if needed. - if (NumInputs > size_type(this->capacity_ptr()-this->end())) - this->grow(this->size()+NumInputs); - - // Copy the new elements over. - std::uninitialized_fill_n(this->end(), NumInputs, Elt); - this->setEnd(this->end() + NumInputs); - } - - void append(std::initializer_list IL) { - append(IL.begin(), IL.end()); - } - - // FIXME: Consider assigning over existing elements, rather than clearing & - // re-initializing them - for all assign(...) 
variants. - - void assign(size_type NumElts, const T &Elt) { - clear(); - if (this->capacity() < NumElts) - this->grow(NumElts); - this->setEnd(this->begin()+NumElts); - std::uninitialized_fill(this->begin(), this->end(), Elt); - } - - template ::iterator_category, - std::input_iterator_tag>::value>::type> - void assign(in_iter in_start, in_iter in_end) { - clear(); - append(in_start, in_end); - } - - void assign(std::initializer_list IL) { - clear(); - append(IL); - } - - iterator erase(const_iterator CI) { - // Just cast away constness because this is a non-const member function. - iterator I = const_cast(CI); - - assert(I >= this->begin() && "Iterator to erase is out of bounds."); - assert(I < this->end() && "Erasing at past-the-end iterator."); - - iterator N = I; - // Shift all elts down one. - std::move(I+1, this->end(), I); - // Drop the last elt. - this->pop_back(); - return(N); - } - - iterator erase(const_iterator CS, const_iterator CE) { - // Just cast away constness because this is a non-const member function. - iterator S = const_cast(CS); - iterator E = const_cast(CE); - - assert(S >= this->begin() && "Range to erase is out of bounds."); - assert(S <= E && "Trying to erase invalid range."); - assert(E <= this->end() && "Trying to erase past the end."); - - iterator N = S; - // Shift all elts down. - iterator I = std::move(E, this->end(), S); - // Drop the last elts. - this->destroy_range(I, this->end()); - this->setEnd(I); - return(N); - } - - iterator insert(iterator I, T &&Elt) { - if (I == this->end()) { // Important special case for empty vector. - this->push_back(::std::move(Elt)); - return this->end()-1; - } - - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); - - if (this->EndX >= this->CapacityX) { - size_t EltNo = I-this->begin(); - this->grow(); - I = this->begin()+EltNo; - } - - ::new ((void*) this->end()) T(::std::move(this->back())); - // Push everything else over. - std::move_backward(I, this->end()-1, this->end()); - this->setEnd(this->end()+1); - - // If we just moved the element we're inserting, be sure to update - // the reference. - T *EltPtr = &Elt; - if (I <= EltPtr && EltPtr < this->EndX) - ++EltPtr; - - *I = ::std::move(*EltPtr); - return I; - } - - iterator insert(iterator I, const T &Elt) { - if (I == this->end()) { // Important special case for empty vector. - this->push_back(Elt); - return this->end()-1; - } - - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); - - if (this->EndX >= this->CapacityX) { - size_t EltNo = I-this->begin(); - this->grow(); - I = this->begin()+EltNo; - } - ::new ((void*) this->end()) T(std::move(this->back())); - // Push everything else over. - std::move_backward(I, this->end()-1, this->end()); - this->setEnd(this->end()+1); - - // If we just moved the element we're inserting, be sure to update - // the reference. - const T *EltPtr = &Elt; - if (I <= EltPtr && EltPtr < this->EndX) - ++EltPtr; - - *I = *EltPtr; - return I; - } - - iterator insert(iterator I, size_type NumToInsert, const T &Elt) { - // Convert iterator to elt# to avoid invalidating iterator when we reserve() - size_t InsertElt = I - this->begin(); - - if (I == this->end()) { // Important special case for empty vector. 
- append(NumToInsert, Elt); - return this->begin()+InsertElt; - } - - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); - - // Ensure there is enough space. - reserve(this->size() + NumToInsert); - - // Uninvalidate the iterator. - I = this->begin()+InsertElt; - - // If there are more elements between the insertion point and the end of the - // range than there are being inserted, we can use a simple approach to - // insertion. Since we already reserved space, we know that this won't - // reallocate the vector. - if (size_t(this->end()-I) >= NumToInsert) { - T *OldEnd = this->end(); - append(std::move_iterator(this->end() - NumToInsert), - std::move_iterator(this->end())); - - // Copy the existing elements that get replaced. - std::move_backward(I, OldEnd-NumToInsert, OldEnd); - - std::fill_n(I, NumToInsert, Elt); - return I; - } - - // Otherwise, we're inserting more elements than exist already, and we're - // not inserting at the end. - - // Move over the elements that we're about to overwrite. - T *OldEnd = this->end(); - this->setEnd(this->end() + NumToInsert); - size_t NumOverwritten = OldEnd-I; - this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten); - - // Replace the overwritten part. - std::fill_n(I, NumOverwritten, Elt); - - // Insert the non-overwritten middle part. - std::uninitialized_fill_n(OldEnd, NumToInsert-NumOverwritten, Elt); - return I; - } - - template ::iterator_category, - std::input_iterator_tag>::value>::type> - iterator insert(iterator I, ItTy From, ItTy To) { - // Convert iterator to elt# to avoid invalidating iterator when we reserve() - size_t InsertElt = I - this->begin(); - - if (I == this->end()) { // Important special case for empty vector. - append(From, To); - return this->begin()+InsertElt; - } - - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); - - size_t NumToInsert = std::distance(From, To); - - // Ensure there is enough space. - reserve(this->size() + NumToInsert); - - // Uninvalidate the iterator. - I = this->begin()+InsertElt; - - // If there are more elements between the insertion point and the end of the - // range than there are being inserted, we can use a simple approach to - // insertion. Since we already reserved space, we know that this won't - // reallocate the vector. - if (size_t(this->end()-I) >= NumToInsert) { - T *OldEnd = this->end(); - append(std::move_iterator(this->end() - NumToInsert), - std::move_iterator(this->end())); - - // Copy the existing elements that get replaced. - std::move_backward(I, OldEnd-NumToInsert, OldEnd); - - std::copy(From, To, I); - return I; - } - - // Otherwise, we're inserting more elements than exist already, and we're - // not inserting at the end. - - // Move over the elements that we're about to overwrite. - T *OldEnd = this->end(); - this->setEnd(this->end() + NumToInsert); - size_t NumOverwritten = OldEnd-I; - this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten); - - // Replace the overwritten part. - for (T *J = I; NumOverwritten > 0; --NumOverwritten) { - *J = *From; - ++J; ++From; - } - - // Insert the non-overwritten middle part. - this->uninitialized_copy(From, To, OldEnd); - return I; - } - - void insert(iterator I, std::initializer_list IL) { - insert(I, IL.begin(), IL.end()); - } - - template void emplace_back(ArgTypes &&... 
Args) { - if (this->EndX >= this->CapacityX) - this->grow(); - ::new ((void *)this->end()) T(std::forward(Args)...); - this->setEnd(this->end() + 1); - } - - SmallVectorImpl &operator=(const SmallVectorImpl &RHS); - - SmallVectorImpl &operator=(SmallVectorImpl &&RHS); - - bool operator==(const SmallVectorImpl &RHS) const { - if (this->size() != RHS.size()) return false; - return std::equal(this->begin(), this->end(), RHS.begin()); - } - bool operator!=(const SmallVectorImpl &RHS) const { - return !(*this == RHS); - } - - bool operator<(const SmallVectorImpl &RHS) const { - return std::lexicographical_compare(this->begin(), this->end(), - RHS.begin(), RHS.end()); - } - - /// Set the array size to \p N, which the current array must have enough - /// capacity for. - /// - /// This does not construct or destroy any elements in the vector. - /// - /// Clients can use this in conjunction with capacity() to write past the end - /// of the buffer when they know that more elements are available, and only - /// update the size later. This avoids the cost of value initializing elements - /// which will only be overwritten. - void set_size(size_type N) { - assert(N <= this->capacity()); - this->setEnd(this->begin() + N); - } -}; - -template -void SmallVectorImpl::swap(SmallVectorImpl &RHS) { - if (this == &RHS) return; - - // We can only avoid copying elements if neither vector is small. - if (!this->isSmall() && !RHS.isSmall()) { - std::swap(this->BeginX, RHS.BeginX); - std::swap(this->EndX, RHS.EndX); - std::swap(this->CapacityX, RHS.CapacityX); - return; - } - if (RHS.size() > this->capacity()) - this->grow(RHS.size()); - if (this->size() > RHS.capacity()) - RHS.grow(this->size()); - - // Swap the shared elements. - size_t NumShared = this->size(); - if (NumShared > RHS.size()) NumShared = RHS.size(); - for (size_type i = 0; i != NumShared; ++i) - std::swap((*this)[i], RHS[i]); - - // Copy over the extra elts. - if (this->size() > RHS.size()) { - size_t EltDiff = this->size() - RHS.size(); - this->uninitialized_copy(this->begin()+NumShared, this->end(), RHS.end()); - RHS.setEnd(RHS.end()+EltDiff); - this->destroy_range(this->begin()+NumShared, this->end()); - this->setEnd(this->begin()+NumShared); - } else if (RHS.size() > this->size()) { - size_t EltDiff = RHS.size() - this->size(); - this->uninitialized_copy(RHS.begin()+NumShared, RHS.end(), this->end()); - this->setEnd(this->end() + EltDiff); - this->destroy_range(RHS.begin()+NumShared, RHS.end()); - RHS.setEnd(RHS.begin()+NumShared); - } -} - -template -SmallVectorImpl &SmallVectorImpl:: - operator=(const SmallVectorImpl &RHS) { - // Avoid self-assignment. - if (this == &RHS) return *this; - - // If we already have sufficient space, assign the common elements, then - // destroy any excess. - size_t RHSSize = RHS.size(); - size_t CurSize = this->size(); - if (CurSize >= RHSSize) { - // Assign common elements. - iterator NewEnd; - if (RHSSize) - NewEnd = std::copy(RHS.begin(), RHS.begin()+RHSSize, this->begin()); - else - NewEnd = this->begin(); - - // Destroy excess elements. - this->destroy_range(NewEnd, this->end()); - - // Trim. - this->setEnd(NewEnd); - return *this; - } - - // If we have to grow to have enough elements, destroy the current elements. - // This allows us to avoid copying them during the grow. - // FIXME: don't do this if they're efficiently moveable. - if (this->capacity() < RHSSize) { - // Destroy current elements. 
- this->destroy_range(this->begin(), this->end()); - this->setEnd(this->begin()); - CurSize = 0; - this->grow(RHSSize); - } else if (CurSize) { - // Otherwise, use assignment for the already-constructed elements. - std::copy(RHS.begin(), RHS.begin()+CurSize, this->begin()); - } - - // Copy construct the new elements in place. - this->uninitialized_copy(RHS.begin()+CurSize, RHS.end(), - this->begin()+CurSize); - - // Set end. - this->setEnd(this->begin()+RHSSize); - return *this; -} - -template -SmallVectorImpl &SmallVectorImpl::operator=(SmallVectorImpl &&RHS) { - // Avoid self-assignment. - if (this == &RHS) return *this; - - // If the RHS isn't small, clear this vector and then steal its buffer. - if (!RHS.isSmall()) { - this->destroy_range(this->begin(), this->end()); - if (!this->isSmall()) free(this->begin()); - this->BeginX = RHS.BeginX; - this->EndX = RHS.EndX; - this->CapacityX = RHS.CapacityX; - RHS.resetToSmall(); - return *this; - } - - // If we already have sufficient space, assign the common elements, then - // destroy any excess. - size_t RHSSize = RHS.size(); - size_t CurSize = this->size(); - if (CurSize >= RHSSize) { - // Assign common elements. - iterator NewEnd = this->begin(); - if (RHSSize) - NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd); - - // Destroy excess elements and trim the bounds. - this->destroy_range(NewEnd, this->end()); - this->setEnd(NewEnd); - - // Clear the RHS. - RHS.clear(); - - return *this; - } - - // If we have to grow to have enough elements, destroy the current elements. - // This allows us to avoid copying them during the grow. - // FIXME: this may not actually make any sense if we can efficiently move - // elements. - if (this->capacity() < RHSSize) { - // Destroy current elements. - this->destroy_range(this->begin(), this->end()); - this->setEnd(this->begin()); - CurSize = 0; - this->grow(RHSSize); - } else if (CurSize) { - // Otherwise, use assignment for the already-constructed elements. - std::move(RHS.begin(), RHS.begin()+CurSize, this->begin()); - } - - // Move-construct the new elements in place. - this->uninitialized_move(RHS.begin()+CurSize, RHS.end(), - this->begin()+CurSize); - - // Set end. - this->setEnd(this->begin()+RHSSize); - - RHS.clear(); - return *this; -} - -/// Storage for the SmallVector elements which aren't contained in -/// SmallVectorTemplateCommon. There are 'N-1' elements here. The remaining '1' -/// element is in the base class. This is specialized for the N=1 and N=0 cases -/// to avoid allocating unnecessary storage. -template -struct SmallVectorStorage { - typename SmallVectorTemplateCommon::U InlineElts[N - 1]; -}; -template struct SmallVectorStorage {}; -template struct SmallVectorStorage {}; - -/// This is a 'vector' (really, a variable-sized array), optimized -/// for the case when the array is small. It contains some number of elements -/// in-place, which allows it to avoid heap allocation when the actual number of -/// elements is below that threshold. This allows normal "small" cases to be -/// fast without losing generality for large inputs. -/// -/// Note that this does not attempt to be exception safe. -/// -template -class SmallVector : public SmallVectorImpl { - /// Inline space for elements which aren't stored in the base class. 
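(Editorial aside, not part of the patch.) The class comment above explains that SmallVector keeps up to N elements in inline storage and only falls back to the heap once that threshold is exceeded. A short sketch of that behaviour, assuming <ATen/SmallVector.h> continues to expose the same interface after the move to ATen/core:

#include <ATen/SmallVector.h>
#include <cassert>

void smallvector_demo() {
  at::SmallVector<int, 4> v;   // room for 4 elements inline, no heap allocation yet
  for (int i = 0; i < 4; ++i)
    v.push_back(i);            // still using the inline buffer
  assert(v.capacity() >= 4);
  v.push_back(4);              // exceeds N = 4: grow() reallocates into heap storage
  assert(v.size() == 5 && v[4] == 4);
}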
- SmallVectorStorage Storage; - -public: - SmallVector() : SmallVectorImpl(N) {} - - explicit SmallVector(size_t Size, const T &Value = T()) - : SmallVectorImpl(N) { - this->assign(Size, Value); - } - - template ::iterator_category, - std::input_iterator_tag>::value>::type> - SmallVector(ItTy S, ItTy E) : SmallVectorImpl(N) { - this->append(S, E); - } - - template - explicit SmallVector(Container &&c) : SmallVectorImpl(N) { - this->append(c.begin(), c.end()); - } - - SmallVector(std::initializer_list IL) : SmallVectorImpl(N) { - this->assign(IL); - } - - SmallVector(const SmallVector &RHS) : SmallVectorImpl(N) { - if (!RHS.empty()) - SmallVectorImpl::operator=(RHS); - } - - const SmallVector &operator=(const SmallVector &RHS) { - SmallVectorImpl::operator=(RHS); - return *this; - } - - SmallVector(SmallVector &&RHS) : SmallVectorImpl(N) { - if (!RHS.empty()) - SmallVectorImpl::operator=(::std::move(RHS)); - } - - template - const SmallVector &operator=(const Container &RHS) { - this->assign(RHS.begin(), RHS.end()); - return *this; - } - - SmallVector(SmallVectorImpl &&RHS) : SmallVectorImpl(N) { - if (!RHS.empty()) - SmallVectorImpl::operator=(::std::move(RHS)); - } - - const SmallVector &operator=(SmallVector &&RHS) { - SmallVectorImpl::operator=(::std::move(RHS)); - return *this; - } - - const SmallVector &operator=(SmallVectorImpl &&RHS) { - SmallVectorImpl::operator=(::std::move(RHS)); - return *this; - } - - template - const SmallVector &operator=(Container &&C) { - this->assign(C.begin(), C.end()); - return *this; - } - - const SmallVector &operator=(std::initializer_list IL) { - this->assign(IL); - return *this; - } -}; - -template -inline size_t capacity_in_bytes(const SmallVector &X) { - return X.capacity_in_bytes(); -} - -} // end namespace at - -namespace std { - - /// Implement std::swap in terms of SmallVector swap. - template - inline void - swap(at::SmallVectorImpl &LHS, at::SmallVectorImpl &RHS) { - LHS.swap(RHS); - } - - /// Implement std::swap in terms of SmallVector swap. - template - inline void - swap(at::SmallVector &LHS, at::SmallVector &RHS) { - LHS.swap(RHS); - } - -} // end namespace std +#include diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index 968fd8ebbec266..03a5a6008e7d24 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -18,14 +18,14 @@ namespace at { // tensor and a [0] size values tensor for such an empty tensor. However, // we don't currently support zero-size dimensions, so we can't actually // do this; so we just allocate zero-size tensors for everything. 
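// Illustrative shape convention (illustration only, not part of this patch),
// matching the comment above: a sparse COO tensor with sparseDims = S,
// denseDims = D and nnz non-zeros stores
//   indices_ : LongTensor of shape [S, nnz]
//   values_  : Tensor of shape [nnz, size[S], ..., size[S + D - 1]]
// so the "ideal" empty tensor described above would carry indices of size
// [1, 0] and values of size [0]; lacking zero-size dimensions, the constructor
// below allocates plain zero-size tensors instead.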
-SparseTensorImpl::SparseTensorImpl(Type * type) - : TensorImpl(type, nullptr) +SparseTensorImpl::SparseTensorImpl(at::Backend backend, at::ScalarType scalar_type) + : TensorImpl(backend, scalar_type, nullptr, false) , size_{0} , sparseDims_(1) , denseDims_(0) - , indices_(type->toDense().toScalarType(ScalarType::Long).tensor()) - , values_(type->toDense().tensor()) { - AT_ASSERT(type->is_sparse()); + , indices_(globalContext().getTypeOpt(toDense(backend), ScalarType::Long)->tensor()) + , values_(globalContext().getTypeOpt(toDense(backend), scalar_type)->tensor()) { + AT_ASSERT(backend == Backend::SparseCPU || backend == Backend::SparseCUDA); } IntList SparseTensorImpl::sizes() const { diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 9ef08705bb0f45..307c0f9e5574d1 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -48,7 +48,7 @@ struct AT_API SparseTensorImpl : public TensorImpl { public: // Public for now... - explicit SparseTensorImpl(Type * type); + explicit SparseTensorImpl(at::Backend, at::ScalarType); int64_t nnz() const { return nnz_; } int64_t sparseDims() const { return sparseDims_; } @@ -75,7 +75,7 @@ struct AT_API SparseTensorImpl : public TensorImpl { if (size.size() == 0) { size_ = {0}; } else { - size_ = size; + size_ = size.vec(); } sparseDims_ = sparseDims; denseDims_ = denseDims; diff --git a/aten/src/ATen/Storage.cpp b/aten/src/ATen/Storage.cpp index f5ba512cc27105..991cfba92efd2a 100644 --- a/aten/src/ATen/Storage.cpp +++ b/aten/src/ATen/Storage.cpp @@ -1,23 +1,32 @@ #include -#include #include namespace at { +Storage::Storage(at::ScalarType scalar_type, size_t size, Allocator* allocator) + : storage_impl_(new StorageImpl( + scalar_type, + size, + allocator, + /* resizable */ false)) {} + +Storage::Storage( + at::ScalarType scalar_type, + at::DataPtr data_ptr, + size_t size, + const std::function& deleter) + : storage_impl_(new StorageImpl( + scalar_type, + size, + std::move(data_ptr), + /* allocator */ nullptr, + /* resizable */ false)) {} + Storage::~Storage() { if (!storage_impl_) { return; } - if (--storage_impl_->refcount == 0) { - if (storage_impl_->finalizer) { - (*storage_impl_->finalizer)(); - } - storage_impl_->finalizer = nullptr; - storage_impl_->data_ptr.clear(); - if (storage_impl_ && --storage_impl_->weakcount == 0) { - delete storage_impl_; - } - } + storage_impl_->release(); } } // namespace at diff --git a/aten/src/ATen/Storage.h b/aten/src/ATen/Storage.h index a5c85192e36f8c..aa27296c74d40f 100644 --- a/aten/src/ATen/Storage.h +++ b/aten/src/ATen/Storage.h @@ -8,6 +8,12 @@ struct AT_API Storage { public: Storage() = delete; Storage(StorageImpl* storage_impl) : storage_impl_(storage_impl) {} + Storage(at::ScalarType, size_t size, Allocator* allocator); + Storage( + at::ScalarType, + at::DataPtr, + size_t size, + const std::function& deleter); ~Storage(); // There are reasonable interpretations of these constructors, but they're to // be implemented on demand. 
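For readers following the refcounting change: the new Storage constructors above allocate a StorageImpl, the destructor now just calls release() on it, and StorageImpl (next hunk) inherits its count from a shared Retainable base while exposing release_resources() as its cleanup hook. A minimal sketch of that pattern, using simplified stand-in names rather than ATen's real Retainable:

#include <atomic>

// Sketch only: RetainableSketch / StorageImplSketch are illustrative stand-ins,
// not the actual at::Retainable / at::StorageImpl.
struct RetainableSketch {
  std::atomic<int> refcount{1};
  virtual ~RetainableSketch() = default;
  virtual void release_resources() {}  // subclass hook, runs before deletion
  void retain() { ++refcount; }
  void release() {
    if (--refcount == 0) {             // last strong reference gone
      release_resources();
      delete this;
    }
  }
};

struct StorageImplSketch : RetainableSketch {
  void release_resources() override {
    // This is where the old ~Storage() logic lands in the patch:
    // run the finalizer, then clear the DataPtr.
  }
};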
diff --git a/aten/src/ATen/StorageImpl.cpp b/aten/src/ATen/StorageImpl.cpp index a26f8971310aa5..6e3d693d012c5c 100644 --- a/aten/src/ATen/StorageImpl.cpp +++ b/aten/src/ATen/StorageImpl.cpp @@ -12,8 +12,6 @@ StorageImpl::StorageImpl( : scalar_type(scalar_type), data_ptr(std::move(data_ptr)), size(size), - refcount(1), - weakcount(1), // from the strong reference resizable(resizable), allocator(allocator), finalizer(nullptr) {} diff --git a/aten/src/ATen/StorageImpl.h b/aten/src/ATen/StorageImpl.h index c48ec51e013d4c..f1c23c54677dba 100644 --- a/aten/src/ATen/StorageImpl.h +++ b/aten/src/ATen/StorageImpl.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -39,7 +40,7 @@ namespace at { struct Type; -struct TH_CPP_API StorageImpl { +struct AT_API StorageImpl : public Retainable { StorageImpl() = delete; virtual ~StorageImpl() {}; @@ -48,8 +49,6 @@ struct TH_CPP_API StorageImpl { at::ScalarType scalar_type; at::DataPtr data_ptr; ptrdiff_t size; - std::atomic refcount; - std::atomic weakcount; bool resizable; at::Allocator* allocator; std::unique_ptr finalizer; @@ -58,6 +57,8 @@ struct TH_CPP_API StorageImpl { StorageImpl(StorageImpl&&) = delete; StorageImpl(const StorageImpl&&) = delete; + // TODO: Rename this into th_data, and move it out of the class; + // the real data shouldn't call th::from_type template inline T* data() const { auto scalar_type_T = at::CTypeToScalarType>::to(); @@ -76,6 +77,14 @@ struct TH_CPP_API StorageImpl { return static_cast(this->data_ptr.get()); } + void release_resources() { + if (finalizer) { + (*finalizer)(); + } + finalizer = nullptr; + data_ptr.clear(); + } + void operator=(const StorageImpl&) = delete; virtual size_t elementSize() const { @@ -94,9 +103,6 @@ struct TH_CPP_API StorageImpl { const void* data() const { return data_ptr.get(); }; - void retain() { - ++refcount; - } int getDevice() const { return data_ptr.device().index(); diff --git a/aten/src/ATen/THLongStorageView.h b/aten/src/ATen/THLongStorageView.h index 55e7d3de6dea4a..8ebcfdaeada40f 100644 --- a/aten/src/ATen/THLongStorageView.h +++ b/aten/src/ATen/THLongStorageView.h @@ -64,7 +64,6 @@ class THLongStorageView { storage.size = ref.size(); } storage.scalar_type = at::CTypeToScalarType>::to(); - storage.refcount = 0; storage.set_resizable(false); } private: diff --git a/aten/src/ATen/TensorGeometry.h b/aten/src/ATen/TensorGeometry.h index 60f6098762cd05..15f59e902182c4 100644 --- a/aten/src/ATen/TensorGeometry.h +++ b/aten/src/ATen/TensorGeometry.h @@ -9,7 +9,7 @@ struct AT_API TensorGeometry { TensorGeometry() : storage_offset_(0) {} explicit TensorGeometry(IntList sizes) - : sizes_(sizes) + : sizes_(sizes.vec()) , strides_(sizes.size()) , storage_offset_(0) { int64_t dim = sizes.size(); @@ -21,8 +21,8 @@ struct AT_API TensorGeometry { } explicit TensorGeometry(const Tensor& t) - : sizes_(t.sizes()) - , strides_(t.strides()) + : sizes_(t.sizes().vec()) + , strides_(t.strides().vec()) , storage_offset_(t.storage_offset()) {} // true if the tensor is contiguous diff --git a/aten/src/ATen/TensorImpl.cpp b/aten/src/ATen/TensorImpl.cpp index 59cc303a1acf5c..a48cb033b2de49 100644 --- a/aten/src/ATen/TensorImpl.cpp +++ b/aten/src/ATen/TensorImpl.cpp @@ -2,10 +2,23 @@ #include #include +#include + +#include #include namespace at { + +Type& TensorImpl::type() const { + Type* base_type = &globalContext().getType(backend_, scalar_type_); + if (is_variable_) { + return detail::getVariableHooks().getVariableType(*base_type); + } else { + return *base_type; + } +} + Tensor& 
TensorImpl::grad() { AT_ERROR("grad is not implemented for Tensor"); } diff --git a/aten/src/ATen/TensorImpl.h b/aten/src/ATen/TensorImpl.h index 9c3591eb96b31f..1aa4d8390ed175 100644 --- a/aten/src/ATen/TensorImpl.h +++ b/aten/src/ATen/TensorImpl.h @@ -18,16 +18,18 @@ struct Tensor; namespace at { struct AT_API TensorImpl : public Retainable { - explicit TensorImpl(Type * type, THTensor * tensor) - : type_(type), tensor(tensor) {} + explicit TensorImpl(Backend backend, ScalarType scalar_type, THTensor * tensor, bool is_variable) + : backend_(backend), scalar_type_(scalar_type), is_variable_(is_variable), tensor(tensor) {} virtual ~TensorImpl(); virtual void release_resources() override; - Type & type() const { - return *type_; - } + // The implementation of this method will have to be hoisted out and + // hooked in, so that Caffe2 doesn't need to know about Context + // TODO: This really really needs to be inlined. + Type & type() const; + const char * toString() const; virtual IntList sizes() const; virtual IntList strides() const; @@ -91,8 +93,12 @@ struct AT_API TensorImpl : public Retainable { virtual void set_data(Tensor new_data); protected: + Backend backend_; + // INVARIANT: When storage is non-null, this scalar type must + // agree with the scalar type in storage + ScalarType scalar_type_; + bool is_variable_ = false; bool is_wrapped_number_ = false; - Type * type_; public: THTensor * tensor; }; diff --git a/aten/src/ATen/UndefinedTensor.cpp b/aten/src/ATen/UndefinedTensor.cpp index 5e4059421c1283..ecfb70fa1bbede 100644 --- a/aten/src/ATen/UndefinedTensor.cpp +++ b/aten/src/ATen/UndefinedTensor.cpp @@ -6,7 +6,7 @@ namespace at { // should this use the globalContext? Can it get a context passed in somehow? UndefinedTensor::UndefinedTensor() -: TensorImpl(&(globalContext().getType(Backend::Undefined,ScalarType::Undefined)), nullptr) { +: TensorImpl(Backend::Undefined, ScalarType::Undefined, nullptr, /* is variable */ false) { } IntList UndefinedTensor::sizes() const { diff --git a/aten/src/ATen/core/ATenCoreTest.cpp b/aten/src/ATen/core/ATenCoreTest.cpp new file mode 100644 index 00000000000000..5bb595a0bce5de --- /dev/null +++ b/aten/src/ATen/core/ATenCoreTest.cpp @@ -0,0 +1,10 @@ +#include + +namespace at { + +static int CoreTestGlobal = 0; +int CoreTest() { + return CoreTestGlobal++; +} + +} // namespace at diff --git a/aten/src/ATen/core/ATenCoreTest.h b/aten/src/ATen/core/ATenCoreTest.h new file mode 100644 index 00000000000000..ee8471f66fe258 --- /dev/null +++ b/aten/src/ATen/core/ATenCoreTest.h @@ -0,0 +1,8 @@ +#pragma once + +#include + +namespace at { + +AT_CORE_API int CoreTest(); +} diff --git a/aten/src/ATen/AlignOf.h b/aten/src/ATen/core/AlignOf.h similarity index 68% rename from aten/src/ATen/AlignOf.h rename to aten/src/ATen/core/AlignOf.h index 5e9f0127b32e70..a7e42196f43ecd 100644 --- a/aten/src/ATen/AlignOf.h +++ b/aten/src/ATen/core/AlignOf.h @@ -33,7 +33,7 @@ namespace at { // MSVC requires special handling here. #ifndef _MSC_VER -template +template struct AlignedCharArray { alignas(Alignment) char buffer[Size]; }; @@ -41,7 +41,7 @@ struct AlignedCharArray { #else // _MSC_VER /// \brief Create a type with an aligned char buffer. -template +template struct AlignedCharArray; // We provide special variations of this template for the most common @@ -52,7 +52,7 @@ struct AlignedCharArray; // MSVC warns on the existence of the declspec despite the union member forcing // proper alignment. 
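// Illustrative usage of AlignedCharArrayUnion (not part of this patch): it is a
// POD buffer whose size and alignment suffice for any of the listed types, so
// it can back a placement-new without default-constructing anything.
// SmallVector, further down in this patch, uses it this way for its first
// inline element. The include path below is an assumption based on the new
// core location; Pair and aligned_union_sketch are hypothetical names.
#include <new>
#include <ATen/core/AlignOf.h>

struct Pair { int a; double b; };   // hypothetical payload type

void aligned_union_sketch() {       // illustration only
  at::AlignedCharArrayUnion<int, double, Pair> storage;
  auto* p = ::new (static_cast<void*>(&storage)) Pair{1, 2.0};  // construct in place
  p->~Pair();                                                   // destroy manually
}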
-template +template struct AlignedCharArray<1, Size> { union { char aligned; @@ -60,7 +60,7 @@ struct AlignedCharArray<1, Size> { }; }; -template +template struct AlignedCharArray<2, Size> { union { short aligned; @@ -68,7 +68,7 @@ struct AlignedCharArray<2, Size> { }; }; -template +template struct AlignedCharArray<4, Size> { union { int aligned; @@ -76,7 +76,7 @@ struct AlignedCharArray<4, Size> { }; }; -template +template struct AlignedCharArray<8, Size> { union { double aligned; @@ -84,14 +84,13 @@ struct AlignedCharArray<8, Size> { }; }; - // The rest of these are provided with a __declspec(align(...)) and we simply // can't pass them by-value as function arguments on MSVC. #define AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \ - template \ - struct AlignedCharArray { \ - __declspec(align(x)) char buffer[Size]; \ + template \ + struct AlignedCharArray { \ + __declspec(align(x)) char buffer[Size]; \ }; AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(16) @@ -104,24 +103,47 @@ AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(128) #endif // _MSC_VER namespace detail { -template +template < + typename T1, + typename T2 = char, + typename T3 = char, + typename T4 = char, + typename T5 = char, + typename T6 = char, + typename T7 = char, + typename T8 = char, + typename T9 = char, + typename T10 = char> class AlignerImpl { - T1 t1; T2 t2; T3 t3; T4 t4; T5 t5; T6 t6; T7 t7; T8 t8; T9 t9; T10 t10; + T1 t1; + T2 t2; + T3 t3; + T4 t4; + T5 t5; + T6 t6; + T7 t7; + T8 t8; + T9 t9; + T10 t10; AlignerImpl() = delete; }; -template +template < + typename T1, + typename T2 = char, + typename T3 = char, + typename T4 = char, + typename T5 = char, + typename T6 = char, + typename T7 = char, + typename T8 = char, + typename T9 = char, + typename T10 = char> union SizerImpl { char arr1[sizeof(T1)], arr2[sizeof(T2)], arr3[sizeof(T3)], arr4[sizeof(T4)], - arr5[sizeof(T5)], arr6[sizeof(T6)], arr7[sizeof(T7)], arr8[sizeof(T8)], - arr9[sizeof(T9)], arr10[sizeof(T10)]; + arr5[sizeof(T5)], arr6[sizeof(T6)], arr7[sizeof(T7)], arr8[sizeof(T8)], + arr9[sizeof(T9)], arr10[sizeof(T10)]; }; } // end namespace detail @@ -132,14 +154,20 @@ union SizerImpl { /// expose a char array buffer member which can be used as suitable storage for /// a placement new of any of these types. Support for more than ten types can /// be added at the cost of more boilerplate. -template -struct AlignedCharArrayUnion : AlignedCharArray< - alignof(detail::AlignerImpl), - sizeof(::at::detail::SizerImpl)> { -}; +template < + typename T1, + typename T2 = char, + typename T3 = char, + typename T4 = char, + typename T5 = char, + typename T6 = char, + typename T7 = char, + typename T8 = char, + typename T9 = char, + typename T10 = char> +struct AlignedCharArrayUnion + : AlignedCharArray< + alignof(detail::AlignerImpl), + sizeof(::at::detail:: + SizerImpl)> {}; } // end namespace at diff --git a/aten/src/ATen/core/ArrayRef.h b/aten/src/ATen/core/ArrayRef.h new file mode 100644 index 00000000000000..7e997d6572f3c0 --- /dev/null +++ b/aten/src/ATen/core/ArrayRef.h @@ -0,0 +1,212 @@ +//===--- ArrayRef.h - Array Reference Wrapper -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::ArrayRef. 
+// removed llvm-specific functionality +// removed some implicit const -> non-const conversions that rely on +// complicated std::enable_if meta-programming +// removed a bunch of slice variants for simplicity... + +#pragma once + +#include +#include +#include + +#include +#include +#include + +namespace at { + +/// ArrayRef - Represent a constant reference to an array (0 or more elements +/// consecutively in memory), i.e. a start pointer and a length. It allows +/// various APIs to take consecutive elements easily and conveniently. +/// +/// This class does not own the underlying data, it is expected to be used in +/// situations where the data resides in some other buffer, whose lifetime +/// extends past that of the ArrayRef. For this reason, it is not in general +/// safe to store an ArrayRef. +/// +/// This is intended to be trivially copyable, so it should be passed by +/// value. +template +class ArrayRef final { + public: + using iterator = const T*; + using const_iterator = const T*; + using size_type = size_t; + + using reverse_iterator = std::reverse_iterator; + + private: + /// The start of the array, in an external buffer. + const T* Data; + + /// The number of elements. + size_type Length; + + public: + /// @name Constructors + /// @{ + + /// Construct an empty ArrayRef. + /* implicit */ constexpr ArrayRef() : Data(nullptr), Length(0) {} + + /// Construct an ArrayRef from a single element. + // TODO Make this explicit + constexpr ArrayRef(const T& OneElt) : Data(&OneElt), Length(1) {} + + /// Construct an ArrayRef from a pointer and length. + constexpr ArrayRef(const T* data, size_t length) + : Data(data), Length(length) {} + + /// Construct an ArrayRef from a range. + constexpr ArrayRef(const T* begin, const T* end) + : Data(begin), Length(end - begin) {} + + /// Construct an ArrayRef from a SmallVector. This is templated in order to + /// avoid instantiating SmallVectorTemplateCommon whenever we + /// copy-construct an ArrayRef. + template + /* implicit */ ArrayRef(const SmallVectorTemplateCommon& Vec) + : Data(Vec.data()), Length(Vec.size()) {} + + /// Construct an ArrayRef from a std::vector. + template + /* implicit */ ArrayRef(const std::vector& Vec) + : Data(Vec.data()), Length(Vec.size()) {} + + /// Construct an ArrayRef from a std::array + template + /* implicit */ constexpr ArrayRef(const std::array& Arr) + : Data(Arr.data()), Length(N) {} + + /// Construct an ArrayRef from a C array. + template + /* implicit */ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {} + + /// Construct an ArrayRef from a std::initializer_list. + /* implicit */ constexpr ArrayRef(const std::initializer_list& Vec) + : Data(Vec.begin() == Vec.end() ? static_cast(nullptr) : Vec.begin()), + Length(Vec.size()) {} + + /// @} + /// @name Simple Operations + /// @{ + + constexpr iterator begin() const { + return Data; + } + constexpr iterator end() const { + return Data + Length; + } + + constexpr reverse_iterator rbegin() const { + return reverse_iterator(end()); + } + constexpr reverse_iterator rend() const { + return reverse_iterator(begin()); + } + + /// empty - Check if the array is empty. + constexpr bool empty() const { + return Length == 0; + } + + constexpr const T* data() const { + return Data; + } + + /// size - Get the array size. + constexpr size_t size() const { + return Length; + } + + /// front - Get the first element. 
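// Illustrative usage (not part of this header): unlike operator[], the checked
// accessors below report misuse through AT_CHECK instead of invoking undefined
// behaviour, e.g.
//
//   at::ArrayRef<int> empty;
//   empty.front();   // throws at::Error: "ArrayRef: attempted to access front() of empty list"
//
//   int arr[3] = {1, 2, 3};
//   at::ArrayRef<int> r(arr);
//   r.at(5);         // throws at::Error: "ArrayRef: invalid index Index = 5; Length = 3"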
+ AT_CPP14_CONSTEXPR const T& front() const { + AT_CHECK(!empty(), "ArrayRef: attempted to access front() of empty list"); + return Data[0]; + } + + /// back - Get the last element. + AT_CPP14_CONSTEXPR const T& back() const { + AT_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); + return Data[Length - 1]; + } + + /// equals - Check for element-wise equality. + constexpr bool equals(ArrayRef RHS) const { + return Length == RHS.Length && std::equal(begin(), end(), RHS.begin()); + } + + /// slice(n, m) - Chop off the first N elements of the array, and keep M + /// elements in the array. + AT_CPP14_CONSTEXPR ArrayRef slice(size_t N, size_t M) const { + AT_CHECK( + N + M <= size(), + "ArrayRef: invalid slice, N = ", + N, + "; M = ", + M, + "; size = ", + size()); + return ArrayRef(data() + N, M); + } + + /// slice(n) - Chop off the first N elements of the array. + constexpr ArrayRef slice(size_t N) const { + return slice(N, size() - N); + } + + /// @} + /// @name Operator Overloads + /// @{ + constexpr const T& operator[](size_t Index) const { + return Data[Index]; + } + + /// Vector compatibility + AT_CPP14_CONSTEXPR const T& at(size_t Index) const { + AT_CHECK( + Index < Length, + "ArrayRef: invalid index Index = ", + Index, + "; Length = ", + Length); + return Data[Index]; + } + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + typename std::enable_if::value, ArrayRef>::type& + operator=(U&& Temporary) = delete; + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + typename std::enable_if::value, ArrayRef>::type& + operator=(std::initializer_list) = delete; + + /// @} + /// @name Expensive Operations + /// @{ + std::vector vec() const { + return std::vector(Data, Data + Length); + } + + /// @} +}; + +} // namespace at diff --git a/aten/src/ATen/Backtrace.cpp b/aten/src/ATen/core/Backtrace.cpp similarity index 92% rename from aten/src/ATen/Backtrace.cpp rename to aten/src/ATen/core/Backtrace.cpp index a8e062051ee633..7914489d50ece3 100644 --- a/aten/src/ATen/Backtrace.cpp +++ b/aten/src/ATen/core/Backtrace.cpp @@ -1,5 +1,5 @@ -#include -#include +#include +#include #include #include @@ -7,18 +7,30 @@ #include #include -#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) +#if defined(__ANDROID__) +#define AT_CORE_MOBILE 1 +#elif ( \ + defined(__APPLE__) && \ + (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE)) +#define AT_CORE_MOBILE 1 +#else +#define AT_CORE_MOBILE 0 +#endif + +#if !AT_CORE_MOBILE && !defined(_WIN32) && !defined(__EMSCRIPTEN__) +#define SUPPORTS_BACKTRACE 1 +#else +#define SUPPORTS_BACKTRACE 0 +#endif + +#if SUPPORTS_BACKTRACE #include #include #endif // !defined(_WIN32) namespace at { -#if defined(_MSC_VER) -// Windows does not have cxxabi.h, so we will simply return the original. 
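// Illustrative behaviour (not part of this patch): with the new
// SUPPORTS_BACKTRACE switch, Windows, Emscripten and the newly detected mobile
// targets all share the identity fallback added below, while supported
// platforms keep demangling through cxxabi, e.g.
//
//   at::demangle("_ZN2at4HalfC1Ef");
//   // -> something like "at::Half::Half(float)" where supported,
//   //    the mangled input returned unchanged otherwise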
-std::string demangle(const char* name) { - return std::string(name); -} -#elif !defined(__EMSCRIPTEN__) + +#if SUPPORTS_BACKTRACE std::string demangle(const char* name) { int status = -1; @@ -45,6 +57,10 @@ std::string demangle(const char* name) { return name; } } +#else +std::string demangle(const char* name) { + return std::string(name); +} #endif // TODO: This backtrace retrieval can be implemented on Windows via the Windows @@ -52,8 +68,7 @@ std::string demangle(const char* name) { // https://stackoverflow.com/questions/5693192/win32-backtrace-from-c-code // https://stackoverflow.com/questions/26398064/counterpart-to-glibcs-backtrace-and-backtrace-symbols-on-windows // https://msdn.microsoft.com/en-us/library/windows/desktop/bb204633%28v=vs.85%29.aspx. -#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) - +#if SUPPORTS_BACKTRACE namespace { struct FrameInformation { @@ -143,14 +158,13 @@ at::optional parse_frame_information( } } // anonymous namespace - -#endif // !defined(_WIN32) +#endif // SUPPORTS_BACKTRACE std::string get_backtrace( size_t frames_to_skip, size_t maximum_number_of_frames, bool skip_python_frames) { -#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) +#if SUPPORTS_BACKTRACE // We always skip this frame (backtrace). frames_to_skip += 1; @@ -221,10 +235,9 @@ std::string get_backtrace( } return stream.str(); - -#else - +#else // !SUPPORTS_BACKTRACE return "(no backtrace available)"; -#endif +#endif // SUPPORTS_BACKTRACE } + } // namespace at diff --git a/aten/src/ATen/core/Backtrace.h b/aten/src/ATen/core/Backtrace.h new file mode 100644 index 00000000000000..ec4c17c6f6a531 --- /dev/null +++ b/aten/src/ATen/core/Backtrace.h @@ -0,0 +1,28 @@ +#pragma once + +#include +#include +#include + +#include + +namespace at { +/// Utility to demangle a C++ symbol name. +AT_CORE_API std::string demangle(const char* name); + +/// Returns the printable name of the type. 
+template +inline const char* demangle_type() { +#ifdef __GXX_RTTI + static const std::string name = demangle(typeid(T).name()); + return name.c_str(); +#else // __GXX_RTTI + return "(RTTI disabled, cannot show name)"; +#endif // __GXX_RTTI +} + +AT_CORE_API std::string get_backtrace( + size_t frames_to_skip = 0, + size_t maximum_number_of_frames = 64, + bool skip_python_frames = true); +} // namespace at diff --git a/aten/src/ATen/core/C++17.cpp b/aten/src/ATen/core/C++17.cpp new file mode 100644 index 00000000000000..6074cb6be15e9c --- /dev/null +++ b/aten/src/ATen/core/C++17.cpp @@ -0,0 +1 @@ +#include diff --git a/caffe2/utils/C++17.h b/aten/src/ATen/core/C++17.h similarity index 93% rename from caffe2/utils/C++17.h rename to aten/src/ATen/core/C++17.h index 0186944e251159..5112d9070dcd5e 100644 --- a/caffe2/utils/C++17.h +++ b/aten/src/ATen/core/C++17.h @@ -95,10 +95,14 @@ template using decay_t = typename std::decay::type; #ifdef __cpp_lib_logical_traits -using conjunction = std::conjunction; -using disjunction = std::disjunction; -using bool_constant = std::bool_constant; -using negation = std::negation; +template +using conjunction = std::conjunction; +template +using disjunction = std::disjunction; +template +using bool_constant = std::bool_constant; +template +using negation = std::negation; #else @@ -145,7 +149,10 @@ template using void_t = typename make_void::type; #ifdef __cpp_lib_apply -using apply = std::apply; +template +inline constexpr decltype(auto) apply(F&& f, Tuple&& t) { + return std::apply(std::forward(f), std::forward(t)); +} #else @@ -175,9 +182,9 @@ constexpr auto apply(F&& f, Tuple&& t) -> decltype(detail::apply_impl( #if defined(__cpp_constexpr) && __cpp_constexpr >= 201304 -# define C10_CPP14_CONSTEXPR constexpr +# define AT_CPP14_CONSTEXPR constexpr #else -# define C10_CPP14_CONSTEXPR +# define AT_CPP14_CONSTEXPR #endif diff --git a/aten/src/ATen/core/CMakeLists.txt b/aten/src/ATen/core/CMakeLists.txt new file mode 100644 index 00000000000000..59149be784c3a6 --- /dev/null +++ b/aten/src/ATen/core/CMakeLists.txt @@ -0,0 +1,16 @@ +# This file solely exists to let Caffe2 Android build get at the list +# of core files without having to trundle through all of ATen's CMakeLists.txt + +FILE(GLOB ATen_CORE_HEADERS "*.h") +FILE(GLOB ATen_CORE_SRCS "*.cpp") +FILE(GLOB ATen_CORE_TEST_SRCS "*_test.cpp") +EXCLUDE(ATen_CORE_SRCS "${ATen_CORE_SRCS}" ${ATen_CORE_TEST_SRCS}) + +# Pass to parent +set(ATen_CORE_HEADERS ${ATen_CORE_HEADERS} PARENT_SCOPE) +set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) +set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) +# This is a little dodgy, because it means ALL ATen headers are made +# visible. Fortunately, you should just get a lot of undefined symbol +# errors if you go outside core +set(ATen_CORE_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../.. PARENT_SCOPE) diff --git a/aten/src/ATen/core/CoreAPI.h b/aten/src/ATen/core/CoreAPI.h new file mode 100644 index 00000000000000..0ee114d9f4cfdd --- /dev/null +++ b/aten/src/ATen/core/CoreAPI.h @@ -0,0 +1,20 @@ +// You can use the definition AT_CORE_STATIC_WINDOWS to control whether +// or not we apply __declspec. You will want to set this as +// -DAT_CORE_STATIC_WINDOWS=1 when compiling code which links +// against ATen/core on Windows, when ATen/core is built as a +// static library (in which case, saying the symbol is coming +// from a DLL would be incorrect). 
+ +#ifdef _WIN32 +#if !defined(AT_CORE_STATIC_WINDOWS) +#if defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) +#define AT_CORE_API __declspec(dllexport) +#else +#define AT_CORE_API __declspec(dllimport) +#endif +#else +#define AT_CORE_API +#endif +#else +#define AT_CORE_API +#endif diff --git a/aten/src/ATen/Error.cpp b/aten/src/ATen/core/Error.cpp similarity index 64% rename from aten/src/ATen/Error.cpp rename to aten/src/ATen/core/Error.cpp index 1261fbe0295d6c..35ba7d644e109b 100644 --- a/aten/src/ATen/Error.cpp +++ b/aten/src/ATen/core/Error.cpp @@ -1,5 +1,5 @@ -#include -#include +#include +#include #include #include @@ -11,9 +11,13 @@ std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) { } Error::Error(SourceLocation source_location, std::string err) - : what_without_backtrace_(err) - , what_(str(err, " (", source_location, ")\n", get_backtrace(/*frames_to_skip=*/2))) - {} + : what_without_backtrace_(err), + what_( + str(err, + " (", + source_location, + ")\n", + get_backtrace(/*frames_to_skip=*/2))) {} void Warning::warn(SourceLocation source_location, std::string msg) { warning_handler_(source_location, msg.c_str()); @@ -23,7 +27,9 @@ void Warning::set_warning_handler(handler_t handler) { warning_handler_ = handler; } -void Warning::print_warning(const SourceLocation& source_location, const char* msg) { +void Warning::print_warning( + const SourceLocation& source_location, + const char* msg) { std::cerr << "Warning: " << msg << " (" << source_location << ")\n"; } diff --git a/aten/src/ATen/core/Error.h b/aten/src/ATen/core/Error.h new file mode 100644 index 00000000000000..b95a5f120f21b8 --- /dev/null +++ b/aten/src/ATen/core/Error.h @@ -0,0 +1,147 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#define __func__ __FUNCTION__ +#endif + +namespace at { + +namespace detail { + +inline std::ostream& _str(std::ostream& ss) { + return ss; +} + +template +inline std::ostream& _str(std::ostream& ss, const T& t) { + ss << t; + return ss; +} + +template +inline std::ostream& _str(std::ostream& ss, const T& t, const Args&... args) { + return _str(_str(ss, t), args...); +} + +} // namespace detail + +// Convert a list of string-like arguments into a single string. +template +inline std::string str(const Args&... args) { + std::ostringstream ss; + detail::_str(ss, args...); + return ss.str(); +} + +// Specializations for already-a-string types. +template <> +inline std::string str(const std::string& str) { + return str; +} +inline std::string str(const char* c_str) { + return c_str; +} + +/// Represents a location in source code (for debugging). +struct SourceLocation { + const char* function; + const char* file; + uint32_t line; +}; + +std::ostream& operator<<(std::ostream& out, const SourceLocation& loc); + +/// The primary ATen error class. +/// Provides a complete error message with source location information via +/// `what()`, and a more concise message via `what_without_backtrace()`. Should +/// primarily be used with the `AT_ERROR` macro. +/// +/// NB: at::Error is handled specially by the default torch to suppress the +/// backtrace, see torch/csrc/Exceptions.h +class AT_CORE_API Error : public std::exception { + std::string what_without_backtrace_; + std::string what_; + + public: + Error(SourceLocation source_location, std::string err); + + /// Returns the complete error message, including the source location. 
+ const char* what() const noexcept override { + return what_.c_str(); + } + + /// Returns only the error message string, without source location. + const char* what_without_backtrace() const noexcept { + return what_without_backtrace_.c_str(); + } +}; + +class AT_CORE_API Warning { + using handler_t = + void (*)(const SourceLocation& source_location, const char* msg); + + public: + /// Issue a warning with a given message. Dispatched to the current + /// warning handler. + static void warn(SourceLocation source_location, std::string msg); + + /// Sets the global warning handler. This is not thread-safe, so it should + /// generally be called once during initialization. + static void set_warning_handler(handler_t handler); + + /// The default warning handler. Prints the message to stderr. + static void print_warning( + const SourceLocation& source_location, + const char* msg); + + private: + static handler_t warning_handler_; +}; + +} // namespace at + +// TODO: variants that print the expression tested and thus don't require +// strings +// TODO: CAFFE_ENFORCE_WITH_CALLER style macro + +#define AT_ERROR(...) \ + throw at::Error({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) + +#define AT_WARN(...) \ + at::Warning::warn({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) + +#define AT_ASSERT(cond) \ + if (!(cond)) { \ + AT_ERROR( \ + #cond " ASSERT FAILED at ", \ + __FILE__, \ + ":", \ + __LINE__, \ + ", please report a bug to PyTorch."); \ + } + +#define AT_ASSERTM(cond, ...) \ + if (!(cond)) { \ + AT_ERROR(at::str( \ + #cond, \ + " ASSERT FAILED at ", \ + __FILE__, \ + ":", \ + __LINE__, \ + ", please report a bug to PyTorch. ", \ + __VA_ARGS__)); \ + } + +#define AT_CHECK(cond, ...) \ + if (!(cond)) { \ + AT_ERROR(at::str(__VA_ARGS__)); \ + } diff --git a/aten/src/ATen/core/Half-inl.h b/aten/src/ATen/core/Half-inl.h new file mode 100644 index 00000000000000..d89b496d7083b8 --- /dev/null +++ b/aten/src/ATen/core/Half-inl.h @@ -0,0 +1,249 @@ +#pragma once + +#include +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +#if defined(__HIP_DEVICE_COMPILE__) +#include +#endif + +namespace at { + +/// Constructors + +inline AT_HOSTDEVICE Half::Half(float value) { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + x = __half_as_short(__float2half(value)); +#else + x = detail::float2halfbits(value); +#endif +} + +/// Implicit conversions + +inline AT_HOSTDEVICE Half::operator float() const { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __half2float(*reinterpret_cast(&x)); +#else + return detail::halfbits2float(x); +#endif +} + +#ifdef __CUDACC__ +inline AT_HOSTDEVICE Half::Half(const __half& value) { + x = *reinterpret_cast(&value); +} +inline AT_HOSTDEVICE Half::operator __half() const { + return *reinterpret_cast(&x); +} +#endif + +/// Arithmetic + +inline AT_HOSTDEVICE Half operator+(const Half& a, const Half& b) { + return (float)a + (float)b; +} + +inline AT_HOSTDEVICE Half operator-(const Half& a, const Half& b) { + return (float)a - (float)b; +} + +inline AT_HOSTDEVICE Half operator*(const Half& a, const Half& b) { + return (float)a * (float)b; +} + +inline AT_HOSTDEVICE Half operator/(const Half& a, const Half& b) { + return (float)a / (float)b; +} + +inline AT_HOSTDEVICE Half operator-(const Half& a) { + return -(float)a; +} + +inline AT_HOSTDEVICE Half& operator+=(Half& a, const Half& b) { + a = a + b; + return a; +} + +inline AT_HOSTDEVICE Half& operator-=(Half& a, const Half& b) { + a = a - b; + return a; +} + +inline 
AT_HOSTDEVICE Half& operator*=(Half& a, const Half& b) { + a = a * b; + return a; +} + +inline AT_HOSTDEVICE Half& operator/=(Half& a, const Half& b) { + a = a / b; + return a; +} + +/// Arithmetic with floats + +inline AT_HOSTDEVICE float operator+(Half a, float b) { + return (float)a + b; +} +inline AT_HOSTDEVICE float operator-(Half a, float b) { + return (float)a - b; +} +inline AT_HOSTDEVICE float operator*(Half a, float b) { + return (float)a * b; +} +inline AT_HOSTDEVICE float operator/(Half a, float b) { + return (float)a / b; +} + +inline AT_HOSTDEVICE float operator+(float a, Half b) { + return a + (float)b; +} +inline AT_HOSTDEVICE float operator-(float a, Half b) { + return a - (float)b; +} +inline AT_HOSTDEVICE float operator*(float a, Half b) { + return a * (float)b; +} +inline AT_HOSTDEVICE float operator/(float a, Half b) { + return a / (float)b; +} + +inline AT_HOSTDEVICE float& operator+=(float& a, const Half& b) { + return a += (float)b; +} +inline AT_HOSTDEVICE float& operator-=(float& a, const Half& b) { + return a -= (float)b; +} +inline AT_HOSTDEVICE float& operator*=(float& a, const Half& b) { + return a *= (float)b; +} +inline AT_HOSTDEVICE float& operator/=(float& a, const Half& b) { + return a /= (float)b; +} + +/// Arithmetic with doubles + +inline AT_HOSTDEVICE double operator+(Half a, double b) { + return (double)a + b; +} +inline AT_HOSTDEVICE double operator-(Half a, double b) { + return (double)a - b; +} +inline AT_HOSTDEVICE double operator*(Half a, double b) { + return (double)a * b; +} +inline AT_HOSTDEVICE double operator/(Half a, double b) { + return (double)a / b; +} + +inline AT_HOSTDEVICE double operator+(double a, Half b) { + return a + (double)b; +} +inline AT_HOSTDEVICE double operator-(double a, Half b) { + return a - (double)b; +} +inline AT_HOSTDEVICE double operator*(double a, Half b) { + return a * (double)b; +} +inline AT_HOSTDEVICE double operator/(double a, Half b) { + return a / (double)b; +} + +/// Arithmetic with ints + +inline AT_HOSTDEVICE Half operator+(Half a, int b) { + return a + (Half)b; +} +inline AT_HOSTDEVICE Half operator-(Half a, int b) { + return a - (Half)b; +} +inline AT_HOSTDEVICE Half operator*(Half a, int b) { + return a * (Half)b; +} +inline AT_HOSTDEVICE Half operator/(Half a, int b) { + return a / (Half)b; +} + +inline AT_HOSTDEVICE Half operator+(int a, Half b) { + return (Half)a + b; +} +inline AT_HOSTDEVICE Half operator-(int a, Half b) { + return (Half)a - b; +} +inline AT_HOSTDEVICE Half operator*(int a, Half b) { + return (Half)a * b; +} +inline AT_HOSTDEVICE Half operator/(int a, Half b) { + return (Half)a / b; +} + +/// NOTE: we do not define comparisons directly and instead rely on the implicit +/// conversion from at::Half to float. 
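// Illustrative usage (not part of this header): every operator above computes
// in float, and comparisons go through the implicit Half -> float conversion,
// e.g.
//
//   at::Half a = 1.5f, b = 0.25f;
//   at::Half c = a + b;   // evaluated as (float)a + (float)b, then narrowed back
//   bool lt = b < a;      // both operands convert to float first
//   float f = c;          // widening a Half to float is always exact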
+ +} // namespace at + +namespace std { + +template <> +class numeric_limits { + public: + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr auto has_denorm = numeric_limits::has_denorm; + static constexpr auto has_denorm_loss = + numeric_limits::has_denorm_loss; + static constexpr auto round_style = numeric_limits::round_style; + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + static constexpr int digits = 11; + static constexpr int digits10 = 3; + static constexpr int max_digits10 = 5; + static constexpr int radix = 2; + static constexpr int min_exponent = -13; + static constexpr int min_exponent10 = -4; + static constexpr int max_exponent = 16; + static constexpr int max_exponent10 = 4; + static constexpr auto traps = numeric_limits::traps; + static constexpr auto tinyness_before = + numeric_limits::tinyness_before; + static constexpr at::Half min() { + return at::Half(0x0400, at::Half::from_bits); + } + static constexpr at::Half lowest() { + return at::Half(0xFBFF, at::Half::from_bits); + } + static constexpr at::Half max() { + return at::Half(0x7BFF, at::Half::from_bits); + } + static constexpr at::Half epsilon() { + return at::Half(0x1400, at::Half::from_bits); + } + static constexpr at::Half round_error() { + return at::Half(0x3800, at::Half::from_bits); + } + static constexpr at::Half infinity() { + return at::Half(0x7C00, at::Half::from_bits); + } + static constexpr at::Half quiet_NaN() { + return at::Half(0x7E00, at::Half::from_bits); + } + static constexpr at::Half signaling_NaN() { + return at::Half(0x7D00, at::Half::from_bits); + } + static constexpr at::Half denorm_min() { + return at::Half(0x0001, at::Half::from_bits); + } +}; + +} // namespace std diff --git a/aten/src/ATen/core/Half.cpp b/aten/src/ATen/core/Half.cpp new file mode 100644 index 00000000000000..e511f03a92bc73 --- /dev/null +++ b/aten/src/ATen/core/Half.cpp @@ -0,0 +1,105 @@ +#include + +#include + +namespace at { + +static_assert( + std::is_standard_layout::value, + "at::Half must be standard layout."); + +namespace detail { + +// Host functions for converting between FP32 and FP16 formats + +float halfbits2float(unsigned short h) { + unsigned sign = ((h >> 15) & 1); + unsigned exponent = ((h >> 10) & 0x1f); + unsigned mantissa = ((h & 0x3ff) << 13); + + if (exponent == 0x1f) { /* NaN or Inf */ + mantissa = (mantissa ? 
(sign = 0, 0x7fffff) : 0); + exponent = 0xff; + } else if (!exponent) { /* Denorm or Zero */ + if (mantissa) { + unsigned int msb; + exponent = 0x71; + do { + msb = (mantissa & 0x400000); + mantissa <<= 1; /* normalize */ + --exponent; + } while (!msb); + mantissa &= 0x7fffff; /* 1.mantissa is implicit */ + } + } else { + exponent += 0x70; + } + + unsigned result_bit = (sign << 31) | (exponent << 23) | mantissa; + + // Reinterpret the result bit pattern as a float + float result_float; + std::memcpy(&result_float, &result_bit, sizeof(result_float)); + return result_float; +} + +unsigned short float2halfbits(float src) { + // Reinterpret the float as a bit pattern + unsigned x; + std::memcpy(&x, &src, sizeof(x)); + + unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; + unsigned sign, exponent, mantissa; + + // Get rid of +NaN/-NaN case first. + if (u > 0x7f800000) { + return 0x7fffU; + } + + sign = ((x >> 16) & 0x8000); + + // Get rid of +Inf/-Inf, +0/-0. + if (u > 0x477fefff) { + return sign | 0x7c00U; + } + if (u < 0x33000001) { + return (sign | 0x0000); + } + + exponent = ((u >> 23) & 0xff); + mantissa = (u & 0x7fffff); + + if (exponent > 0x70) { + shift = 13; + exponent -= 0x70; + } else { + shift = 0x7e - exponent; + exponent = 0; + mantissa |= 0x800000; + } + lsb = (1 << shift); + lsb_s1 = (lsb >> 1); + lsb_m1 = (lsb - 1); + + // Round to nearest even. + remainder = (mantissa & lsb_m1); + mantissa >>= shift; + if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { + ++mantissa; + if (!(mantissa & 0x3ff)) { + ++exponent; + mantissa = 0; + } + } + + return (sign | (exponent << 10) | mantissa); +} + +} // namespace detail + +std::ostream& operator<<(std::ostream& out, const Half& value) { + out << (float)value; + return out; +} + +} // namespace at diff --git a/aten/src/ATen/core/Half.h b/aten/src/ATen/core/Half.h new file mode 100644 index 00000000000000..385f18e78cab02 --- /dev/null +++ b/aten/src/ATen/core/Half.h @@ -0,0 +1,127 @@ +#pragma once + +/// Defines the Half type (half-precision floating-point) including conversions +/// to standard C types and basic arithmetic operations. Note that arithmetic +/// operations are implemented by converting to floating point and +/// performing the operation in float32, instead of using CUDA half intrinisics. +/// Most uses of this type within ATen are memory bound, including the +/// element-wise kernels, and the half intrinisics aren't efficient on all GPUs. +/// If you are writing a compute bound kernel, you can use the CUDA half +/// intrinsics directly on the Half type from device code. 
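// Illustrative round trip (not part of this header): on the host, Half(float)
// and operator float() dispatch to the float2halfbits / halfbits2float helpers
// implemented in Half.cpp above, e.g.
//
//   unsigned short bits = at::detail::float2halfbits(3.14159f);  // 0x4248
//   float back = at::detail::halfbits2float(bits);               // 3.140625f
//   at::Half pi(0x4248, at::Half::from_bits);  // same bit pattern, no conversion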
+ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +#if defined(__HIP_DEVICE_COMPILE__) +#include +#endif + +#ifndef AT_HOSTDEVICE +#ifdef __CUDACC__ +#define AT_HOSTDEVICE __host__ __device__ +#else +#define AT_HOSTDEVICE +#endif +#endif + +namespace at { + +namespace detail { + +AT_CORE_API float halfbits2float(unsigned short bits); +AT_CORE_API unsigned short float2halfbits(float value); + +} // namespace detail + +struct alignas(2) Half { + unsigned short x; + + struct from_bits_t {}; + static constexpr from_bits_t from_bits = from_bits_t(); + + // HIP wants __host__ __device__ tag, CUDA does not +#ifdef __HIP_PLATFORM_HCC__ + AT_HOSTDEVICE Half() = default; +#else + Half() = default; +#endif + + constexpr AT_HOSTDEVICE Half(unsigned short bits, from_bits_t) : x(bits){}; + inline AT_HOSTDEVICE Half(float value); + inline AT_HOSTDEVICE operator float() const; + +#ifdef __CUDACC__ + inline AT_HOSTDEVICE Half(const __half& value); + inline AT_HOSTDEVICE operator __half() const; +#endif +}; + +template +To convert(From f) { + return static_cast(f); +} + +// skip isnan and isinf check for integral types +template +typename std::enable_if::value, bool>::type overflows( + From f) { + using limit = std::numeric_limits; + if (!limit::is_signed && std::numeric_limits::is_signed) { + // allow for negative numbers to wrap using two's complement arithmetic. + // For example, with uint8, this allows for `a - b` to be treated as + // `a + 255 * b`. + return f > limit::max() || (f < 0 && -(uint64_t)f > limit::max()); + } else { + return f < limit::lowest() || f > limit::max(); + } +} + +template +typename std::enable_if::value, bool>::type overflows( + From f) { + using limit = std::numeric_limits; + if (limit::has_infinity && std::isinf((double)f)) { + return false; + } + if (!limit::has_quiet_NaN && (f != f)) { + return true; + } + return f < limit::lowest() || f > limit::max(); +} + +template +To checked_convert(From f, const char* name) { + if (overflows(f)) { + std::ostringstream oss; + oss << "value cannot be converted to type " << name << " without overflow: " << f; + throw std::domain_error(oss.str()); + } + return convert(f); +} + +template +To HalfFix(From h) { + To ret; + ret.x = h.x; + return ret; +} + +AT_CORE_API std::ostream& operator<<(std::ostream& out, const Half& value); + +} // namespace at + +#include "ATen/core/Half-inl.h" + +#undef AT_HOSTDEVICE diff --git a/aten/src/ATen/core/IdWrapper.h b/aten/src/ATen/core/IdWrapper.h new file mode 100644 index 00000000000000..7d152269d9a8c2 --- /dev/null +++ b/aten/src/ATen/core/IdWrapper.h @@ -0,0 +1,75 @@ +#pragma once + +#include + +namespace at { + +/** + * This template simplifies generation of simple classes that wrap an id + * in a typesafe way. Namely, you can use it to create a very lightweight + * type that only offers equality comparators and hashing. Example: + * + * struct MyIdType final : IdWrapper { + * constexpr explicit MyIdType(uint32_t id): IdWrapper(id) {} + * }; + * + * Then in the global top level namespace: + * + * AT_DEFINE_HASH_FOR_IDWRAPPER(MyIdType); + * + * That's it - equality operators and hash functions are automatically defined + * for you, given the underlying type supports it. 
+ */ +template +class IdWrapper { + public: + using underlying_type = UnderlyingType; + using concrete_type = ConcreteType; + + protected: + constexpr explicit IdWrapper(underlying_type id) noexcept( + noexcept(underlying_type(std::declval()))) + : id_(id) {} + + constexpr underlying_type underlyingId() const + noexcept(noexcept(underlying_type(std::declval()))) { + return id_; + } + + private: + friend size_t hash_value(const concrete_type& v) { + return std::hash()(v.id_); + } + + // TODO Making operator== noexcept if underlying type is noexcept equality + // comparable doesn't work with GCC 4.8. + // Fix this once we don't need GCC 4.8 anymore. + friend constexpr bool operator==( + const concrete_type& lhs, + const concrete_type& rhs) { + return lhs.id_ == rhs.id_; + } + + // TODO Making operator!= noexcept if operator== is noexcept doesn't work with + // GCC 4.8. + // Fix this once we don't need GCC 4.8 anymore. + friend constexpr bool operator!=( + const concrete_type& lhs, + const concrete_type& rhs) { + return !(lhs == rhs); + } + + underlying_type id_; +}; + +} // namespace at + +#define AT_DEFINE_HASH_FOR_IDWRAPPER(ClassName) \ + namespace std { \ + template <> \ + struct hash { \ + size_t operator()(ClassName x) const { \ + return hash_value(x); \ + } \ + }; \ + } diff --git a/aten/src/ATen/core/README.md b/aten/src/ATen/core/README.md new file mode 100644 index 00000000000000..71654f44e26f91 --- /dev/null +++ b/aten/src/ATen/core/README.md @@ -0,0 +1,5 @@ +ATen Core +--------- + +ATen Core is a minimal subset of ATen which is suitable for deployment +on mobile. Binary size of files in this folder is an important constraint. diff --git a/aten/src/ATen/SmallVector.cpp b/aten/src/ATen/core/SmallVector.cpp similarity index 87% rename from aten/src/ATen/SmallVector.cpp rename to aten/src/ATen/core/SmallVector.cpp index 59095a2809c7a8..976809c5b50931 100644 --- a/aten/src/ATen/SmallVector.cpp +++ b/aten/src/ATen/core/SmallVector.cpp @@ -14,20 +14,22 @@ // ATen: modified from llvm::SmallVector. // replaced report_bad_alloc_error with std::bad_alloc -#include "SmallVector.h" +#include namespace at { /// grow_pod - This is an implementation of the grow() method which only works /// on POD-like datatypes and is out of line to reduce code duplication. -void SmallVectorBase::grow_pod(void *FirstEl, size_t MinSizeInBytes, - size_t TSize) { +void SmallVectorBase::grow_pod( + void* FirstEl, + size_t MinSizeInBytes, + size_t TSize) { size_t CurSizeBytes = size_in_bytes(); size_t NewCapacityInBytes = 2 * capacity_in_bytes() + TSize; // Always grow. if (NewCapacityInBytes < MinSizeInBytes) NewCapacityInBytes = MinSizeInBytes; - void *NewElts; + void* NewElts; if (BeginX == FirstEl) { NewElts = malloc(NewCapacityInBytes); if (NewElts == nullptr) @@ -42,9 +44,9 @@ void SmallVectorBase::grow_pod(void *FirstEl, size_t MinSizeInBytes, throw std::bad_alloc(); } - this->EndX = (char*)NewElts+CurSizeBytes; + this->EndX = (char*)NewElts + CurSizeBytes; this->BeginX = NewElts; this->CapacityX = (char*)this->BeginX + NewCapacityInBytes; } -} +} // namespace at diff --git a/aten/src/ATen/core/SmallVector.h b/aten/src/ATen/core/SmallVector.h new file mode 100644 index 00000000000000..269b21b0d5cf37 --- /dev/null +++ b/aten/src/ATen/core/SmallVector.h @@ -0,0 +1,1034 @@ +//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the SmallVector class. +// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::SmallVector. +// replaced report_bad_alloc_error with std::bad_alloc +// replaced isPodLike with AT_IS_TRIVIALLY_COPYABLE +// replaced iterator_range constructor with inline Container&& constructor +// removed LLVM_NODISCARD and LLVM_ATTRIBUTE_ALWAYS_INLINE qualifiers +// removed LLVM_UNLIKELY + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if __GNUG__ && __GNUC__ < 5 +#define AT_IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T) +#else +#define AT_IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable::value +#endif + +namespace at { + +namespace detail { + +// From llvm/Support/MathExtras.h +static inline uint64_t NextPowerOf2(uint64_t A) { + A |= (A >> 1); + A |= (A >> 2); + A |= (A >> 4); + A |= (A >> 8); + A |= (A >> 16); + A |= (A >> 32); + return A + 1; +} + +} // namespace detail + +/// This is all the non-templated stuff common to all SmallVectors. +class AT_CORE_API SmallVectorBase { + protected: + void *BeginX, *EndX, *CapacityX; + + protected: + SmallVectorBase(void* FirstEl, size_t Size) + : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl + Size) {} + + /// This is an implementation of the grow() method which only works + /// on POD-like data types and is out of line to reduce code duplication. + void grow_pod(void* FirstEl, size_t MinSizeInBytes, size_t TSize); + + public: + /// This returns size()*sizeof(T). + size_t size_in_bytes() const { + return size_t((char*)EndX - (char*)BeginX); + } + + /// capacity_in_bytes - This returns capacity()*sizeof(T). + size_t capacity_in_bytes() const { + return size_t((char*)CapacityX - (char*)BeginX); + } + + bool empty() const { + return BeginX == EndX; + } +}; + +/// This is the part of SmallVectorTemplateBase which does not depend on whether +/// the type T is a POD. The extra dummy template argument is used by ArrayRef +/// to avoid unnecessarily requiring T to be complete. +template +class SmallVectorTemplateCommon : public SmallVectorBase { + private: + template + friend struct SmallVectorStorage; + + // Allocate raw space for N elements of type T. If T has a ctor or dtor, we + // don't want it to be automatically run, so we need to represent the space as + // something else. Use an array of char of sufficient alignment. + using U = AlignedCharArrayUnion; + U FirstEl; + // Space after 'FirstEl' is clobbered, do not add any instance vars after it. + + protected: + SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(&FirstEl, Size) {} + + void grow_pod(size_t MinSizeInBytes, size_t TSize) { + SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize); + } + + /// Return true if this is a smallvector which has not had dynamic + /// memory allocated for it. + bool isSmall() const { + return BeginX == static_cast(&FirstEl); + } + + /// Put this vector in a state of being small. 
+ void resetToSmall() { + BeginX = EndX = CapacityX = &FirstEl; + } + + void setEnd(T* P) { + this->EndX = P; + } + + public: + using size_type = size_t; + using difference_type = ptrdiff_t; + using value_type = T; + using iterator = T*; + using const_iterator = const T*; + + using const_reverse_iterator = std::reverse_iterator; + using reverse_iterator = std::reverse_iterator; + + using reference = T&; + using const_reference = const T&; + using pointer = T*; + using const_pointer = const T*; + + // forward iterator creation methods. + iterator begin() { + return (iterator)this->BeginX; + } + const_iterator begin() const { + return (const_iterator)this->BeginX; + } + iterator end() { + return (iterator)this->EndX; + } + const_iterator end() const { + return (const_iterator)this->EndX; + } + + protected: + iterator capacity_ptr() { + return (iterator)this->CapacityX; + } + const_iterator capacity_ptr() const { + return (const_iterator)this->CapacityX; + } + + public: + // reverse iterator creation methods. + reverse_iterator rbegin() { + return reverse_iterator(end()); + } + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + reverse_iterator rend() { + return reverse_iterator(begin()); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + size_type size() const { + return end() - begin(); + } + size_type max_size() const { + return size_type(-1) / sizeof(T); + } + + /// Return the total number of elements in the currently allocated buffer. + size_t capacity() const { + return capacity_ptr() - begin(); + } + + /// Return a pointer to the vector's buffer, even if empty(). + pointer data() { + return pointer(begin()); + } + /// Return a pointer to the vector's buffer, even if empty(). + const_pointer data() const { + return const_pointer(begin()); + } + + reference operator[](size_type idx) { + assert(idx < size()); + return begin()[idx]; + } + const_reference operator[](size_type idx) const { + assert(idx < size()); + return begin()[idx]; + } + + reference front() { + assert(!empty()); + return begin()[0]; + } + const_reference front() const { + assert(!empty()); + return begin()[0]; + } + + reference back() { + assert(!empty()); + return end()[-1]; + } + const_reference back() const { + assert(!empty()); + return end()[-1]; + } +}; + +/// SmallVectorTemplateBase - This is where we put method +/// implementations that are designed to work with non-POD-like T's. +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { + protected: + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + static void destroy_range(T* S, T* E) { + while (S != E) { + --E; + E->~T(); + } + } + + /// Move the range [I, E) into the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy( + std::make_move_iterator(I), std::make_move_iterator(E), Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy(I, E, Dest); + } + + /// Grow the allocated memory (without initializing new elements), doubling + /// the size of the allocated memory. Guarantees space for at least one more + /// element, or MinSize more elements if specified. 
+ void grow(size_t MinSize = 0); + + public: + void push_back(const T& Elt) { + if (this->EndX >= this->CapacityX) + this->grow(); + ::new ((void*)this->end()) T(Elt); + this->setEnd(this->end() + 1); + } + + void push_back(T&& Elt) { + if (this->EndX >= this->CapacityX) + this->grow(); + ::new ((void*)this->end()) T(::std::move(Elt)); + this->setEnd(this->end() + 1); + } + + void pop_back() { + this->setEnd(this->end() - 1); + this->end()->~T(); + } +}; + +// Define this out-of-line to dissuade the C++ compiler from inlining it. +template +void SmallVectorTemplateBase::grow(size_t MinSize) { + size_t CurCapacity = this->capacity(); + size_t CurSize = this->size(); + // Always grow, even from zero. + size_t NewCapacity = size_t(detail::NextPowerOf2(CurCapacity + 2)); + if (NewCapacity < MinSize) + NewCapacity = MinSize; + T* NewElts = static_cast(malloc(NewCapacity * sizeof(T))); + if (NewElts == nullptr) + throw std::bad_alloc(); + + // Move the elements over. + this->uninitialized_move(this->begin(), this->end(), NewElts); + + // Destroy the original elements. + destroy_range(this->begin(), this->end()); + + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + free(this->begin()); + + this->setEnd(NewElts + CurSize); + this->BeginX = NewElts; + this->CapacityX = this->begin() + NewCapacity; +} + +/// SmallVectorTemplateBase - This is where we put method +/// implementations that are designed to work with POD-like T's. +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { + protected: + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + // No need to do a destroy loop for POD's. + static void destroy_range(T*, T*) {} + + /// Move the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + // Just do a copy. + uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + // Arbitrary iterator types; just use the basic implementation. + std::uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_copy( + T1* I, + T1* E, + T2* Dest, + typename std::enable_if< + std::is_same::type, T2>::value>:: + type* = nullptr) { + // Use memcpy for PODs iterated by pointers (which includes SmallVector + // iterators): std::uninitialized_copy optimizes to memmove, but we can + // use memcpy here. Note that I and E are iterators and thus might be + // invalid for memcpy if they are equal. + if (I != E) + memcpy(Dest, I, (E - I) * sizeof(T)); + } + + /// Double the size of the allocated memory, guaranteeing space for at + /// least one more element or MinSize if specified. + void grow(size_t MinSize = 0) { + this->grow_pod(MinSize * sizeof(T), sizeof(T)); + } + + public: + void push_back(const T& Elt) { + if (this->EndX >= this->CapacityX) + this->grow(); + memcpy(this->end(), &Elt, sizeof(T)); + this->setEnd(this->end() + 1); + } + + void pop_back() { + this->setEnd(this->end() - 1); + } +}; + +/// This class consists of common code factored out of the SmallVector class to +/// reduce code duplication based on the SmallVector 'N' template parameter. 
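// Illustrative usage (not part of this header): SmallVectorImpl<T> is the
// N-independent interface, so helpers can take it by reference and accept a
// SmallVector of any inline capacity, e.g.
//
//   void fill(at::SmallVectorImpl<int>& out) { out.append({1, 2, 3}); }
//
//   at::SmallVector<int, 8> buf;   // first 8 ints live inside buf, no heap yet
//   fill(buf);                     // only grows onto the heap past 8 elements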
+template +class SmallVectorImpl + : public SmallVectorTemplateBase { + using SuperClass = SmallVectorTemplateBase; + + public: + using iterator = typename SuperClass::iterator; + using const_iterator = typename SuperClass::const_iterator; + using size_type = typename SuperClass::size_type; + + protected: + // Default ctor - Initialize to empty. + explicit SmallVectorImpl(unsigned N) + : SmallVectorTemplateBase(N * sizeof(T)) { + } + + public: + SmallVectorImpl(const SmallVectorImpl&) = delete; + + ~SmallVectorImpl() { + // Destroy the constructed elements in the vector. + this->destroy_range(this->begin(), this->end()); + + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + free(this->begin()); + } + + void clear() { + this->destroy_range(this->begin(), this->end()); + this->EndX = this->BeginX; + } + + void resize(size_type N) { + if (N < this->size()) { + this->destroy_range(this->begin() + N, this->end()); + this->setEnd(this->begin() + N); + } else if (N > this->size()) { + if (this->capacity() < N) + this->grow(N); + auto I = this->end(); + for (auto E = this->begin() + N; I != E; ++I) + new (&*I) T(); + this->setEnd(this->begin() + N); + } + } + + void resize(size_type N, const T& NV) { + if (N < this->size()) { + this->destroy_range(this->begin() + N, this->end()); + this->setEnd(this->begin() + N); + } else if (N > this->size()) { + if (this->capacity() < N) + this->grow(N); + std::uninitialized_fill(this->end(), this->begin() + N, NV); + this->setEnd(this->begin() + N); + } + } + + void reserve(size_type N) { + if (this->capacity() < N) + this->grow(N); + } + + T pop_back_val() { + T Result = ::std::move(this->back()); + this->pop_back(); + return Result; + } + + void swap(SmallVectorImpl& RHS); + + /// Add the specified range to the end of the SmallVector. + template < + typename in_iter, + typename = typename std::enable_if::iterator_category, + std::input_iterator_tag>::value>::type> + void append(in_iter in_start, in_iter in_end) { + size_type NumInputs = std::distance(in_start, in_end); + // Grow allocated space if needed. + if (NumInputs > size_type(this->capacity_ptr() - this->end())) + this->grow(this->size() + NumInputs); + + // Copy the new elements over. + this->uninitialized_copy(in_start, in_end, this->end()); + this->setEnd(this->end() + NumInputs); + } + + /// Add the specified range to the end of the SmallVector. + void append(size_type NumInputs, const T& Elt) { + // Grow allocated space if needed. + if (NumInputs > size_type(this->capacity_ptr() - this->end())) + this->grow(this->size() + NumInputs); + + // Copy the new elements over. + std::uninitialized_fill_n(this->end(), NumInputs, Elt); + this->setEnd(this->end() + NumInputs); + } + + void append(std::initializer_list IL) { + append(IL.begin(), IL.end()); + } + + // FIXME: Consider assigning over existing elements, rather than clearing & + // re-initializing them - for all assign(...) variants. 
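+  //
+  // Illustrative example only -- the assign(...) overloads below replace the
+  // current contents wholesale:
+  //
+  //   at::SmallVector<int, 4> v;
+  //   v.assign(3, 7);          // v == {7, 7, 7}
+  //   v.assign({1, 2, 3, 4});  // v == {1, 2, 3, 4}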
+ + void assign(size_type NumElts, const T& Elt) { + clear(); + if (this->capacity() < NumElts) + this->grow(NumElts); + this->setEnd(this->begin() + NumElts); + std::uninitialized_fill(this->begin(), this->end(), Elt); + } + + template < + typename in_iter, + typename = typename std::enable_if::iterator_category, + std::input_iterator_tag>::value>::type> + void assign(in_iter in_start, in_iter in_end) { + clear(); + append(in_start, in_end); + } + + void assign(std::initializer_list IL) { + clear(); + append(IL); + } + + iterator erase(const_iterator CI) { + // Just cast away constness because this is a non-const member function. + iterator I = const_cast(CI); + + assert(I >= this->begin() && "Iterator to erase is out of bounds."); + assert(I < this->end() && "Erasing at past-the-end iterator."); + + iterator N = I; + // Shift all elts down one. + std::move(I + 1, this->end(), I); + // Drop the last elt. + this->pop_back(); + return (N); + } + + iterator erase(const_iterator CS, const_iterator CE) { + // Just cast away constness because this is a non-const member function. + iterator S = const_cast(CS); + iterator E = const_cast(CE); + + assert(S >= this->begin() && "Range to erase is out of bounds."); + assert(S <= E && "Trying to erase invalid range."); + assert(E <= this->end() && "Trying to erase past the end."); + + iterator N = S; + // Shift all elts down. + iterator I = std::move(E, this->end(), S); + // Drop the last elts. + this->destroy_range(I, this->end()); + this->setEnd(I); + return (N); + } + + iterator insert(iterator I, T&& Elt) { + if (I == this->end()) { // Important special case for empty vector. + this->push_back(::std::move(Elt)); + return this->end() - 1; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + if (this->EndX >= this->CapacityX) { + size_t EltNo = I - this->begin(); + this->grow(); + I = this->begin() + EltNo; + } + + ::new ((void*)this->end()) T(::std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end() - 1, this->end()); + this->setEnd(this->end() + 1); + + // If we just moved the element we're inserting, be sure to update + // the reference. + T* EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = ::std::move(*EltPtr); + return I; + } + + iterator insert(iterator I, const T& Elt) { + if (I == this->end()) { // Important special case for empty vector. + this->push_back(Elt); + return this->end() - 1; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + if (this->EndX >= this->CapacityX) { + size_t EltNo = I - this->begin(); + this->grow(); + I = this->begin() + EltNo; + } + ::new ((void*)this->end()) T(std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end() - 1, this->end()); + this->setEnd(this->end() + 1); + + // If we just moved the element we're inserting, be sure to update + // the reference. + const T* EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = *EltPtr; + return I; + } + + iterator insert(iterator I, size_type NumToInsert, const T& Elt) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. 
+ append(NumToInsert, Elt); + return this->begin() + InsertElt; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin() + InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end() - I) >= NumToInsert) { + T* OldEnd = this->end(); + append( + std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd - NumToInsert, OldEnd); + + std::fill_n(I, NumToInsert, Elt); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T* OldEnd = this->end(); + this->setEnd(this->end() + NumToInsert); + size_t NumOverwritten = OldEnd - I; + this->uninitialized_move(I, OldEnd, this->end() - NumOverwritten); + + // Replace the overwritten part. + std::fill_n(I, NumOverwritten, Elt); + + // Insert the non-overwritten middle part. + std::uninitialized_fill_n(OldEnd, NumToInsert - NumOverwritten, Elt); + return I; + } + + template < + typename ItTy, + typename = typename std::enable_if::iterator_category, + std::input_iterator_tag>::value>::type> + iterator insert(iterator I, ItTy From, ItTy To) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. + append(From, To); + return this->begin() + InsertElt; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + size_t NumToInsert = std::distance(From, To); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin() + InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end() - I) >= NumToInsert) { + T* OldEnd = this->end(); + append( + std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd - NumToInsert, OldEnd); + + std::copy(From, To, I); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T* OldEnd = this->end(); + this->setEnd(this->end() + NumToInsert); + size_t NumOverwritten = OldEnd - I; + this->uninitialized_move(I, OldEnd, this->end() - NumOverwritten); + + // Replace the overwritten part. + for (T* J = I; NumOverwritten > 0; --NumOverwritten) { + *J = *From; + ++J; + ++From; + } + + // Insert the non-overwritten middle part. 
+ this->uninitialized_copy(From, To, OldEnd); + return I; + } + + void insert(iterator I, std::initializer_list IL) { + insert(I, IL.begin(), IL.end()); + } + + template + void emplace_back(ArgTypes&&... Args) { + if (this->EndX >= this->CapacityX) + this->grow(); + ::new ((void*)this->end()) T(std::forward(Args)...); + this->setEnd(this->end() + 1); + } + + SmallVectorImpl& operator=(const SmallVectorImpl& RHS); + + SmallVectorImpl& operator=(SmallVectorImpl&& RHS); + + bool operator==(const SmallVectorImpl& RHS) const { + if (this->size() != RHS.size()) + return false; + return std::equal(this->begin(), this->end(), RHS.begin()); + } + bool operator!=(const SmallVectorImpl& RHS) const { + return !(*this == RHS); + } + + bool operator<(const SmallVectorImpl& RHS) const { + return std::lexicographical_compare( + this->begin(), this->end(), RHS.begin(), RHS.end()); + } + + /// Set the array size to \p N, which the current array must have enough + /// capacity for. + /// + /// This does not construct or destroy any elements in the vector. + /// + /// Clients can use this in conjunction with capacity() to write past the end + /// of the buffer when they know that more elements are available, and only + /// update the size later. This avoids the cost of value initializing elements + /// which will only be overwritten. + void set_size(size_type N) { + assert(N <= this->capacity()); + this->setEnd(this->begin() + N); + } +}; + +template +void SmallVectorImpl::swap(SmallVectorImpl& RHS) { + if (this == &RHS) + return; + + // We can only avoid copying elements if neither vector is small. + if (!this->isSmall() && !RHS.isSmall()) { + std::swap(this->BeginX, RHS.BeginX); + std::swap(this->EndX, RHS.EndX); + std::swap(this->CapacityX, RHS.CapacityX); + return; + } + if (RHS.size() > this->capacity()) + this->grow(RHS.size()); + if (this->size() > RHS.capacity()) + RHS.grow(this->size()); + + // Swap the shared elements. + size_t NumShared = this->size(); + if (NumShared > RHS.size()) + NumShared = RHS.size(); + for (size_type i = 0; i != NumShared; ++i) + std::swap((*this)[i], RHS[i]); + + // Copy over the extra elts. + if (this->size() > RHS.size()) { + size_t EltDiff = this->size() - RHS.size(); + this->uninitialized_copy(this->begin() + NumShared, this->end(), RHS.end()); + RHS.setEnd(RHS.end() + EltDiff); + this->destroy_range(this->begin() + NumShared, this->end()); + this->setEnd(this->begin() + NumShared); + } else if (RHS.size() > this->size()) { + size_t EltDiff = RHS.size() - this->size(); + this->uninitialized_copy(RHS.begin() + NumShared, RHS.end(), this->end()); + this->setEnd(this->end() + EltDiff); + this->destroy_range(RHS.begin() + NumShared, RHS.end()); + RHS.setEnd(RHS.begin() + NumShared); + } +} + +template +SmallVectorImpl& SmallVectorImpl::operator=( + const SmallVectorImpl& RHS) { + // Avoid self-assignment. + if (this == &RHS) + return *this; + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd; + if (RHSSize) + NewEnd = std::copy(RHS.begin(), RHS.begin() + RHSSize, this->begin()); + else + NewEnd = this->begin(); + + // Destroy excess elements. + this->destroy_range(NewEnd, this->end()); + + // Trim. + this->setEnd(NewEnd); + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. 
+ // FIXME: don't do this if they're efficiently moveable. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->destroy_range(this->begin(), this->end()); + this->setEnd(this->begin()); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::copy(RHS.begin(), RHS.begin() + CurSize, this->begin()); + } + + // Copy construct the new elements in place. + this->uninitialized_copy( + RHS.begin() + CurSize, RHS.end(), this->begin() + CurSize); + + // Set end. + this->setEnd(this->begin() + RHSSize); + return *this; +} + +template +SmallVectorImpl& SmallVectorImpl::operator=(SmallVectorImpl&& RHS) { + // Avoid self-assignment. + if (this == &RHS) + return *this; + + // If the RHS isn't small, clear this vector and then steal its buffer. + if (!RHS.isSmall()) { + this->destroy_range(this->begin(), this->end()); + if (!this->isSmall()) + free(this->begin()); + this->BeginX = RHS.BeginX; + this->EndX = RHS.EndX; + this->CapacityX = RHS.CapacityX; + RHS.resetToSmall(); + return *this; + } + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd = this->begin(); + if (RHSSize) + NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd); + + // Destroy excess elements and trim the bounds. + this->destroy_range(NewEnd, this->end()); + this->setEnd(NewEnd); + + // Clear the RHS. + RHS.clear(); + + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: this may not actually make any sense if we can efficiently move + // elements. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->destroy_range(this->begin(), this->end()); + this->setEnd(this->begin()); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::move(RHS.begin(), RHS.begin() + CurSize, this->begin()); + } + + // Move-construct the new elements in place. + this->uninitialized_move( + RHS.begin() + CurSize, RHS.end(), this->begin() + CurSize); + + // Set end. + this->setEnd(this->begin() + RHSSize); + + RHS.clear(); + return *this; +} + +/// Storage for the SmallVector elements which aren't contained in +/// SmallVectorTemplateCommon. There are 'N-1' elements here. The remaining '1' +/// element is in the base class. This is specialized for the N=1 and N=0 cases +/// to avoid allocating unnecessary storage. +template +struct SmallVectorStorage { + typename SmallVectorTemplateCommon::U InlineElts[N - 1]; +}; +template +struct SmallVectorStorage {}; +template +struct SmallVectorStorage {}; + +/// This is a 'vector' (really, a variable-sized array), optimized +/// for the case when the array is small. It contains some number of elements +/// in-place, which allows it to avoid heap allocation when the actual number of +/// elements is below that threshold. This allows normal "small" cases to be +/// fast without losing generality for large inputs. +/// +/// Note that this does not attempt to be exception safe. +/// +template +class SmallVector : public SmallVectorImpl { + /// Inline space for elements which aren't stored in the base class. 
+ SmallVectorStorage Storage; + + public: + SmallVector() : SmallVectorImpl(N) {} + + explicit SmallVector(size_t Size, const T& Value = T()) + : SmallVectorImpl(N) { + this->assign(Size, Value); + } + + template < + typename ItTy, + typename = typename std::enable_if::iterator_category, + std::input_iterator_tag>::value>::type> + SmallVector(ItTy S, ItTy E) : SmallVectorImpl(N) { + this->append(S, E); + } + + template + explicit SmallVector(Container&& c) : SmallVectorImpl(N) { + this->append(c.begin(), c.end()); + } + + SmallVector(std::initializer_list IL) : SmallVectorImpl(N) { + this->assign(IL); + } + + SmallVector(const SmallVector& RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(RHS); + } + + const SmallVector& operator=(const SmallVector& RHS) { + SmallVectorImpl::operator=(RHS); + return *this; + } + + SmallVector(SmallVector&& RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + template + const SmallVector& operator=(const Container& RHS) { + this->assign(RHS.begin(), RHS.end()); + return *this; + } + + SmallVector(SmallVectorImpl&& RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + const SmallVector& operator=(SmallVector&& RHS) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + const SmallVector& operator=(SmallVectorImpl&& RHS) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + template + const SmallVector& operator=(Container&& C) { + this->assign(C.begin(), C.end()); + return *this; + } + + const SmallVector& operator=(std::initializer_list IL) { + this->assign(IL); + return *this; + } +}; + +template +inline size_t capacity_in_bytes(const SmallVector& X) { + return X.capacity_in_bytes(); +} + +} // end namespace at + +namespace std { + +/// Implement std::swap in terms of SmallVector swap. +template +inline void swap(at::SmallVectorImpl& LHS, at::SmallVectorImpl& RHS) { + LHS.swap(RHS); +} + +/// Implement std::swap in terms of SmallVector swap. +template +inline void swap(at::SmallVector& LHS, at::SmallVector& RHS) { + LHS.swap(RHS); +} + +} // end namespace std diff --git a/aten/src/ATen/core/UniqueVoidPtr.cpp b/aten/src/ATen/core/UniqueVoidPtr.cpp new file mode 100644 index 00000000000000..fd08f7e13d2bf8 --- /dev/null +++ b/aten/src/ATen/core/UniqueVoidPtr.cpp @@ -0,0 +1,9 @@ +#include + +namespace at { +namespace detail { + +void deleteNothing(void*) {} + +} // namespace detail +} // namespace at diff --git a/aten/src/ATen/detail/UniqueVoidPtr.h b/aten/src/ATen/core/UniqueVoidPtr.h similarity index 77% rename from aten/src/ATen/detail/UniqueVoidPtr.h rename to aten/src/ATen/core/UniqueVoidPtr.h index e277014a7935d6..299c729e125a58 100644 --- a/aten/src/ATen/detail/UniqueVoidPtr.h +++ b/aten/src/ATen/core/UniqueVoidPtr.h @@ -1,15 +1,15 @@ #include -#include +#include namespace at { -using DeleterFnPtr = void(*)(void*); +using DeleterFnPtr = void (*)(void*); namespace detail { // Does not delete anything -AT_API void deleteNothing(void*); +AT_CORE_API void deleteNothing(void*); // A detail::UniqueVoidPtr is an owning smart pointer like unique_ptr, but // with three major differences: @@ -35,33 +35,47 @@ AT_API void deleteNothing(void*); // to reflect this. 
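+// For example (illustrative sketch; deleteIntBuffer is a hypothetical helper,
+// not part of this header):
+//
+//   void deleteIntBuffer(void* ctx) { delete[] static_cast<int*>(ctx); }
+//
+//   int* raw = new int[16];
+//   at::detail::UniqueVoidPtr ptr(raw, /*ctx=*/raw, &deleteIntBuffer);
+//   // ptr.get() == raw; deleteIntBuffer(raw) runs when ptr is destroyed.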
// class UniqueVoidPtr { -private: + private: // Lifetime tied to ctx_ void* data_; std::unique_ptr ctx_; -public: + + public: UniqueVoidPtr() : data_(nullptr), ctx_(nullptr, &deleteNothing) {} - explicit UniqueVoidPtr(void* data) : data_(data), ctx_(nullptr, &deleteNothing) {} + explicit UniqueVoidPtr(void* data) + : data_(data), ctx_(nullptr, &deleteNothing) {} UniqueVoidPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter) - : data_(data), ctx_(ctx, ctx_deleter ? ctx_deleter : &deleteNothing) {} - void* operator->() const { return data_; } + : data_(data), ctx_(ctx, ctx_deleter ? ctx_deleter : &deleteNothing) {} + void* operator->() const { + return data_; + } void clear() { ctx_ = nullptr; data_ = nullptr; } - void* get() const { return data_; } - void* get_context() const { return ctx_.get(); } - void* release_context() { return ctx_.release(); } + void* get() const { + return data_; + } + void* get_context() const { + return ctx_.get(); + } + void* release_context() { + return ctx_.release(); + } template T* cast_context(DeleterFnPtr expected_deleter) const { - if (get_deleter() != expected_deleter) return nullptr; + if (get_deleter() != expected_deleter) + return nullptr; return static_cast(get_context()); } - operator bool() const { return data_ || ctx_; } - DeleterFnPtr get_deleter() const { return ctx_.get_deleter(); } + operator bool() const { + return data_ || ctx_; + } + DeleterFnPtr get_deleter() const { + return ctx_.get_deleter(); + } }; - // Note [How UniqueVoidPtr is implemented] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // UniqueVoidPtr solves a common problem for allocators of tensor data, which @@ -80,9 +94,18 @@ class UniqueVoidPtr { // pointer itself. In simple cases, the context pointer is just the pointer // itself. -inline bool operator==(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { return !sp; } -inline bool operator==(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { return !sp; } -inline bool operator!=(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { return sp; } -inline bool operator!=(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { return sp; } +inline bool operator==(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { + return !sp; +} +inline bool operator==(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { + return !sp; +} +inline bool operator!=(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { + return sp; +} +inline bool operator!=(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { + return sp; +} -}} // namespace at::detail +} // namespace detail +} // namespace at diff --git a/aten/src/ATen/core/optional.h b/aten/src/ATen/core/optional.h new file mode 100644 index 00000000000000..8b0a7bfc4ead31 --- /dev/null +++ b/aten/src/ATen/core/optional.h @@ -0,0 +1,1027 @@ +// Copyright (C) 2011 - 2012 Andrzej Krzemienski. +// +// Use, modification, and distribution is subject to the Boost Software +// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// +// The idea and interface is based on Boost.Optional library +// authored by Fernando Luis Cacciola Carballal +// +// From https://github.com/akrzemi1/Optional +// +// ATen: +// - Move to `at` namespace. +// - Remove macro use in line 478 because the nvcc device compiler cannot handle +// it. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#define TR2_OPTIONAL_REQUIRES(...) 
\ + typename std::enable_if<__VA_ARGS__::value, bool>::type = false + +#if defined __GNUC__ // NOTE: GNUC is also defined for Clang +#if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8) +#define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ +#elif (__GNUC__ > 4) +#define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ +#endif +# +#if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 7) +#define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ +#elif (__GNUC__ > 4) +#define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ +#endif +# +#if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8) && (__GNUC_PATCHLEVEL__ >= 1) +#define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#elif (__GNUC__ == 4) && (__GNUC_MINOR__ >= 9) +#define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#elif (__GNUC__ > 4) +#define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#endif +#endif +# +#if defined __clang_major__ +#if (__clang_major__ == 3 && __clang_minor__ >= 5) +#define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +#elif (__clang_major__ > 3) +#define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +#endif +#if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +#define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +#elif ( \ + __clang_major__ == 3 && __clang_minor__ == 4 && __clang_patchlevel__ >= 2) +#define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +#endif +#endif +# +#if defined _MSC_VER +#if (_MSC_VER >= 1900) +#define TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +#endif +#endif + +#if defined __clang__ +#if (__clang_major__ > 2) || (__clang_major__ == 2) && (__clang_minor__ >= 9) +#define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +#else +#define OPTIONAL_HAS_THIS_RVALUE_REFS 0 +#endif +#elif defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +#elif defined TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +#define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +#else +#define OPTIONAL_HAS_THIS_RVALUE_REFS 0 +#endif + +#if defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 1 +#define OPTIONAL_CONSTEXPR_INIT_LIST constexpr +#else +#define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 0 +#define OPTIONAL_CONSTEXPR_INIT_LIST +#endif + +#if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ && (defined __cplusplus) && \ + (__cplusplus != 201103L) +#define OPTIONAL_HAS_MOVE_ACCESSORS 1 +#else +#define OPTIONAL_HAS_MOVE_ACCESSORS 0 +#endif + +#// In C++11 constexpr implies const, so we need to make non-const members also non-constexpr +#if (defined __cplusplus) && (__cplusplus == 201103L) +#define OPTIONAL_MUTABLE_CONSTEXPR +#else +#define OPTIONAL_MUTABLE_CONSTEXPR constexpr +#endif + +namespace at { + +// 20.5.4, optional for object types +template +class optional; + +// 20.5.5, optional for lvalue reference types +template +class optional; + +// workaround: std utility functions aren't constexpr yet +template +inline constexpr T&& constexpr_forward( + typename std::remove_reference::type& t) noexcept { + return static_cast(t); +} + +template +inline constexpr T&& constexpr_forward( + typename std::remove_reference::type&& t) noexcept { + static_assert(!std::is_lvalue_reference::value, "!!"); + return static_cast(t); +} + +template +inline constexpr typename std::remove_reference::type&& constexpr_move( + T&& t) noexcept { + return static_cast::type&&>(t); +} + +#if defined NDEBUG +#define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) (EXPR) +#else +#define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) \ + ((CHECK) ? (EXPR) : ([] { assert(!#CHECK); }(), (EXPR))) +#endif + +namespace detail_ { + +// static_addressof: a constexpr version of addressof +template +struct has_overloaded_addressof { + template + constexpr static bool has_overload(...) 
{ + return false; + } + + template ().operator&())> + constexpr static bool has_overload(bool) { + return true; + } + + constexpr static bool value = has_overload(true); +}; + +template )> +constexpr T* static_addressof(T& ref) { + return &ref; +} + +template )> +T* static_addressof(T& ref) { + return std::addressof(ref); +} + +// the call to convert(b) has return type A and converts b to type A iff b +// decltype(b) is implicitly convertible to A +template +constexpr U convert(U v) { + return v; +} + +} // namespace detail_ + +constexpr struct trivial_init_t { +} trivial_init{}; + +// 20.5.6, In-place construction +constexpr struct in_place_t { +} in_place{}; + +// 20.5.7, Disengaged state indicator +struct nullopt_t { + struct init {}; + constexpr explicit nullopt_t(init) {} +}; +constexpr nullopt_t nullopt{nullopt_t::init()}; + +// 20.5.8, class bad_optional_access +class bad_optional_access : public std::logic_error { + public: + explicit bad_optional_access(const std::string& what_arg) + : logic_error{what_arg} {} + explicit bad_optional_access(const char* what_arg) : logic_error{what_arg} {} +}; + +template +union storage_t { + unsigned char dummy_; + T value_; + + constexpr storage_t(trivial_init_t) noexcept : dummy_(){}; + + template + constexpr storage_t(Args&&... args) + : value_(constexpr_forward(args)...) {} + + ~storage_t() {} +}; + +template +union constexpr_storage_t { + unsigned char dummy_; + T value_; + + constexpr constexpr_storage_t(trivial_init_t) noexcept : dummy_(){}; + + template + constexpr constexpr_storage_t(Args&&... args) + : value_(constexpr_forward(args)...) {} + + ~constexpr_storage_t() = default; +}; + +template +struct optional_base { + bool init_; + storage_t storage_; + + constexpr optional_base() noexcept : init_(false), storage_(trivial_init){}; + + explicit constexpr optional_base(const T& v) : init_(true), storage_(v) {} + + explicit constexpr optional_base(T&& v) + : init_(true), storage_(constexpr_move(v)) {} + + template + explicit optional_base(in_place_t, Args&&... args) + : init_(true), storage_(constexpr_forward(args)...) {} + + template < + class U, + class... Args, + TR2_OPTIONAL_REQUIRES(std::is_constructible>)> + explicit optional_base( + in_place_t, + std::initializer_list il, + Args&&... args) + : init_(true), storage_(il, std::forward(args)...) {} + + ~optional_base() { + if (init_) + storage_.value_.T::~T(); + } +}; + +template +struct constexpr_optional_base { + bool init_; + constexpr_storage_t storage_; + + constexpr constexpr_optional_base() noexcept + : init_(false), storage_(trivial_init){}; + + explicit constexpr constexpr_optional_base(const T& v) + : init_(true), storage_(v) {} + + explicit constexpr constexpr_optional_base(T&& v) + : init_(true), storage_(constexpr_move(v)) {} + + template + explicit constexpr constexpr_optional_base(in_place_t, Args&&... args) + : init_(true), storage_(constexpr_forward(args)...) {} + + template < + class U, + class... Args, + TR2_OPTIONAL_REQUIRES(std::is_constructible>)> + OPTIONAL_CONSTEXPR_INIT_LIST explicit constexpr_optional_base( + in_place_t, + std::initializer_list il, + Args&&... args) + : init_(true), storage_(il, std::forward(args)...) 
{} + + ~constexpr_optional_base() = default; +}; + +template +using OptionalBase = typename std::conditional< + std::is_trivially_destructible::value, // if possible + constexpr_optional_base::type>, // use base with trivial destructor + optional_base::type>>::type; + +template +class optional : private OptionalBase { + static_assert( + !std::is_same::type, nullopt_t>::value, + "bad T"); + static_assert( + !std::is_same::type, in_place_t>::value, + "bad T"); + + constexpr bool initialized() const noexcept { + return OptionalBase::init_; + } + typename std::remove_const::type* dataptr() { + return std::addressof(OptionalBase::storage_.value_); + } + constexpr const T* dataptr() const { + return detail_::static_addressof(OptionalBase::storage_.value_); + } + +#if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 + constexpr const T& contained_val() const& { + return OptionalBase::storage_.value_; + } +#if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + OPTIONAL_MUTABLE_CONSTEXPR T&& contained_val() && { + return std::move(OptionalBase::storage_.value_); + } + OPTIONAL_MUTABLE_CONSTEXPR T& contained_val() & { + return OptionalBase::storage_.value_; + } +#else + T& contained_val() & { + return OptionalBase::storage_.value_; + } + T&& contained_val() && { + return std::move(OptionalBase::storage_.value_); + } +#endif +#else + constexpr const T& contained_val() const { + return OptionalBase::storage_.value_; + } + T& contained_val() { + return OptionalBase::storage_.value_; + } +#endif + + void clear() noexcept { + if (initialized()) + dataptr()->T::~T(); + OptionalBase::init_ = false; + } + + template + void initialize(Args&&... args) noexcept( + noexcept(T(std::forward(args)...))) { + assert(!OptionalBase::init_); + ::new (static_cast(dataptr())) T(std::forward(args)...); + OptionalBase::init_ = true; + } + + template + void initialize(std::initializer_list il, Args&&... args) noexcept( + noexcept(T(il, std::forward(args)...))) { + assert(!OptionalBase::init_); + ::new (static_cast(dataptr())) T(il, std::forward(args)...); + OptionalBase::init_ = true; + } + + public: + typedef T value_type; + + // 20.5.5.1, constructors + constexpr optional() noexcept : OptionalBase(){}; + constexpr optional(nullopt_t) noexcept : OptionalBase(){}; + + optional(const optional& rhs) : OptionalBase() { + if (rhs.initialized()) { + ::new (static_cast(dataptr())) T(*rhs); + OptionalBase::init_ = true; + } + } + + optional(optional&& rhs) noexcept( + std::is_nothrow_move_constructible::value) + : OptionalBase() { + if (rhs.initialized()) { + ::new (static_cast(dataptr())) T(std::move(*rhs)); + OptionalBase::init_ = true; + } + } + + constexpr optional(const T& v) : OptionalBase(v) {} + + constexpr optional(T&& v) : OptionalBase(constexpr_move(v)) {} + + template + explicit constexpr optional(in_place_t, Args&&... args) + : OptionalBase(in_place_t{}, constexpr_forward(args)...) {} + + template < + class U, + class... Args, + TR2_OPTIONAL_REQUIRES(std::is_constructible>)> + OPTIONAL_CONSTEXPR_INIT_LIST explicit optional( + in_place_t, + std::initializer_list il, + Args&&... args) + : OptionalBase(in_place_t{}, il, constexpr_forward(args)...) 
{} + + // 20.5.4.2, Destructor + ~optional() = default; + + // 20.5.4.3, assignment + optional& operator=(nullopt_t) noexcept { + clear(); + return *this; + } + + optional& operator=(const optional& rhs) { + if (initialized() == true && rhs.initialized() == false) + clear(); + else if (initialized() == false && rhs.initialized() == true) + initialize(*rhs); + else if (initialized() == true && rhs.initialized() == true) + contained_val() = *rhs; + return *this; + } + + optional& operator=(optional&& rhs) noexcept( + std::is_nothrow_move_assignable::value&& + std::is_nothrow_move_constructible::value) { + if (initialized() == true && rhs.initialized() == false) + clear(); + else if (initialized() == false && rhs.initialized() == true) + initialize(std::move(*rhs)); + else if (initialized() == true && rhs.initialized() == true) + contained_val() = std::move(*rhs); + return *this; + } + + template + auto operator=(U&& v) -> typename std::enable_if< + std::is_same::type, T>::value, + optional&>::type { + if (initialized()) { + contained_val() = std::forward(v); + } else { + initialize(std::forward(v)); + } + return *this; + } + + template + void emplace(Args&&... args) { + clear(); + initialize(std::forward(args)...); + } + + template + void emplace(std::initializer_list il, Args&&... args) { + clear(); + initialize(il, std::forward(args)...); + } + + // 20.5.4.4, Swap + void swap(optional& rhs) noexcept( + std::is_nothrow_move_constructible::value&& noexcept( + swap(std::declval(), std::declval()))) { + if (initialized() == true && rhs.initialized() == false) { + rhs.initialize(std::move(**this)); + clear(); + } else if (initialized() == false && rhs.initialized() == true) { + initialize(std::move(*rhs)); + rhs.clear(); + } else if (initialized() == true && rhs.initialized() == true) { + using std::swap; + swap(**this, *rhs); + } + } + + // 20.5.4.5, Observers + + explicit constexpr operator bool() const noexcept { + return initialized(); + } + constexpr bool has_value() const noexcept { + return initialized(); + } + + constexpr T const* operator->() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), dataptr()); + } + +#if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + + OPTIONAL_MUTABLE_CONSTEXPR T* operator->() { + assert(initialized()); + return dataptr(); + } + + constexpr T const& operator*() const& { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T& operator*() & { + assert(initialized()); + return contained_val(); + } + + OPTIONAL_MUTABLE_CONSTEXPR T&& operator*() && { + assert(initialized()); + return constexpr_move(contained_val()); + } + + constexpr T const& value() const& { + return initialized() + ? contained_val() + : (throw bad_optional_access("bad optional access"), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T& value() & { + return initialized() + ? contained_val() + : (throw bad_optional_access("bad optional access"), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T&& value() && { + if (!initialized()) + throw bad_optional_access("bad optional access"); + return std::move(contained_val()); + } + +#else + + T* operator->() { + assert(initialized()); + return dataptr(); + } + + constexpr T const& operator*() const { + return contained_val(); + } + + T& operator*() { + assert(initialized()); + return contained_val(); + } + + constexpr T const& value() const { + return initialized() + ? 
contained_val() + : (throw bad_optional_access("bad optional access"), contained_val()); + } + + T& value() { + return initialized() + ? contained_val() + : (throw bad_optional_access("bad optional access"), contained_val()); + } + +#endif + +#if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 + + template + constexpr T value_or(V&& v) const& { + return *this ? **this : detail_::convert(constexpr_forward(v)); + } + +#if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + + template + OPTIONAL_MUTABLE_CONSTEXPR T value_or(V&& v) && { + return *this + ? constexpr_move(const_cast&>(*this).contained_val()) + : detail_::convert(constexpr_forward(v)); + } + +#else + + template + T value_or(V&& v) && { + return *this + ? constexpr_move(const_cast&>(*this).contained_val()) + : detail_::convert(constexpr_forward(v)); + } + +#endif + +#else + + template + constexpr T value_or(V&& v) const { + return *this ? **this : detail_::convert(constexpr_forward(v)); + } + +#endif + + // 20.6.3.6, modifiers + void reset() noexcept { + clear(); + } +}; + +template +class optional { + static_assert(!std::is_same::value, "bad T"); + static_assert(!std::is_same::value, "bad T"); + T* ref; + + public: + // 20.5.5.1, construction/destruction + constexpr optional() noexcept : ref(nullptr) {} + + constexpr optional(nullopt_t) noexcept : ref(nullptr) {} + + constexpr optional(T& v) noexcept : ref(detail_::static_addressof(v)) {} + + optional(T&&) = delete; + + constexpr optional(const optional& rhs) noexcept : ref(rhs.ref) {} + + explicit constexpr optional(in_place_t, T& v) noexcept + : ref(detail_::static_addressof(v)) {} + + explicit optional(in_place_t, T&&) = delete; + + ~optional() = default; + + // 20.5.5.2, mutation + optional& operator=(nullopt_t) noexcept { + ref = nullptr; + return *this; + } + + // optional& operator=(const optional& rhs) noexcept { + // ref = rhs.ref; + // return *this; + // } + + // optional& operator=(optional&& rhs) noexcept { + // ref = rhs.ref; + // return *this; + // } + + template + auto operator=(U&& rhs) noexcept -> typename std::enable_if< + std::is_same::type, optional>::value, + optional&>::type { + ref = rhs.ref; + return *this; + } + + template + auto operator=(U&& rhs) noexcept -> typename std::enable_if< + !std::is_same::type, optional>::value, + optional&>::type = delete; + + void emplace(T& v) noexcept { + ref = detail_::static_addressof(v); + } + + void emplace(T&&) = delete; + + void swap(optional& rhs) noexcept { + std::swap(ref, rhs.ref); + } + + // 20.5.5.3, observers + constexpr T* operator->() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, ref); + } + + constexpr T& operator*() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, *ref); + } + + constexpr T& value() const { + return ref ? *ref + : (throw bad_optional_access("bad optional access"), *ref); + } + + explicit constexpr operator bool() const noexcept { + return ref != nullptr; + } + + constexpr bool has_value() const noexcept { + return ref != nullptr; + } + + template + constexpr typename std::decay::type value_or(V&& v) const { + return *this ? **this + : detail_::convert::type>( + constexpr_forward(v)); + } + + // x.x.x.x, modifiers + void reset() noexcept { + ref = nullptr; + } +}; + +template +class optional { + static_assert(sizeof(T) == 0, "optional rvalue references disallowed"); +}; + +// 20.5.8, Relational operators +template +constexpr bool operator==(const optional& x, const optional& y) { + return bool(x) != bool(y) ? false : bool(x) == false ? 
true : *x == *y; +} + +template +constexpr bool operator!=(const optional& x, const optional& y) { + return !(x == y); +} + +template +constexpr bool operator<(const optional& x, const optional& y) { + return (!y) ? false : (!x) ? true : *x < *y; +} + +template +constexpr bool operator>(const optional& x, const optional& y) { + return (y < x); +} + +template +constexpr bool operator<=(const optional& x, const optional& y) { + return !(y < x); +} + +template +constexpr bool operator>=(const optional& x, const optional& y) { + return !(x < y); +} + +// 20.5.9, Comparison with nullopt +template +constexpr bool operator==(const optional& x, nullopt_t) noexcept { + return (!x); +} + +template +constexpr bool operator==(nullopt_t, const optional& x) noexcept { + return (!x); +} + +template +constexpr bool operator!=(const optional& x, nullopt_t) noexcept { + return bool(x); +} + +template +constexpr bool operator!=(nullopt_t, const optional& x) noexcept { + return bool(x); +} + +template +constexpr bool operator<(const optional&, nullopt_t) noexcept { + return false; +} + +template +constexpr bool operator<(nullopt_t, const optional& x) noexcept { + return bool(x); +} + +template +constexpr bool operator<=(const optional& x, nullopt_t) noexcept { + return (!x); +} + +template +constexpr bool operator<=(nullopt_t, const optional&) noexcept { + return true; +} + +template +constexpr bool operator>(const optional& x, nullopt_t) noexcept { + return bool(x); +} + +template +constexpr bool operator>(nullopt_t, const optional&) noexcept { + return false; +} + +template +constexpr bool operator>=(const optional&, nullopt_t) noexcept { + return true; +} + +template +constexpr bool operator>=(nullopt_t, const optional& x) noexcept { + return (!x); +} + +// 20.5.10, Comparison with T +template +constexpr bool operator==(const optional& x, const T& v) { + return bool(x) ? *x == v : false; +} + +template +constexpr bool operator==(const T& v, const optional& x) { + return bool(x) ? v == *x : false; +} + +template +constexpr bool operator!=(const optional& x, const T& v) { + return bool(x) ? *x != v : true; +} + +template +constexpr bool operator!=(const T& v, const optional& x) { + return bool(x) ? v != *x : true; +} + +template +constexpr bool operator<(const optional& x, const T& v) { + return bool(x) ? *x < v : true; +} + +template +constexpr bool operator>(const T& v, const optional& x) { + return bool(x) ? v > *x : true; +} + +template +constexpr bool operator>(const optional& x, const T& v) { + return bool(x) ? *x > v : false; +} + +template +constexpr bool operator<(const T& v, const optional& x) { + return bool(x) ? v < *x : false; +} + +template +constexpr bool operator>=(const optional& x, const T& v) { + return bool(x) ? *x >= v : false; +} + +template +constexpr bool operator<=(const T& v, const optional& x) { + return bool(x) ? v <= *x : false; +} + +template +constexpr bool operator<=(const optional& x, const T& v) { + return bool(x) ? *x <= v : true; +} + +template +constexpr bool operator>=(const T& v, const optional& x) { + return bool(x) ? v >= *x : true; +} + +// Comparison of optional with T +template +constexpr bool operator==(const optional& x, const T& v) { + return bool(x) ? *x == v : false; +} + +template +constexpr bool operator==(const T& v, const optional& x) { + return bool(x) ? v == *x : false; +} + +template +constexpr bool operator!=(const optional& x, const T& v) { + return bool(x) ? 
*x != v : true; +} + +template +constexpr bool operator!=(const T& v, const optional& x) { + return bool(x) ? v != *x : true; +} + +template +constexpr bool operator<(const optional& x, const T& v) { + return bool(x) ? *x < v : true; +} + +template +constexpr bool operator>(const T& v, const optional& x) { + return bool(x) ? v > *x : true; +} + +template +constexpr bool operator>(const optional& x, const T& v) { + return bool(x) ? *x > v : false; +} + +template +constexpr bool operator<(const T& v, const optional& x) { + return bool(x) ? v < *x : false; +} + +template +constexpr bool operator>=(const optional& x, const T& v) { + return bool(x) ? *x >= v : false; +} + +template +constexpr bool operator<=(const T& v, const optional& x) { + return bool(x) ? v <= *x : false; +} + +template +constexpr bool operator<=(const optional& x, const T& v) { + return bool(x) ? *x <= v : true; +} + +template +constexpr bool operator>=(const T& v, const optional& x) { + return bool(x) ? v >= *x : true; +} + +// Comparison of optional with T +template +constexpr bool operator==(const optional& x, const T& v) { + return bool(x) ? *x == v : false; +} + +template +constexpr bool operator==(const T& v, const optional& x) { + return bool(x) ? v == *x : false; +} + +template +constexpr bool operator!=(const optional& x, const T& v) { + return bool(x) ? *x != v : true; +} + +template +constexpr bool operator!=(const T& v, const optional& x) { + return bool(x) ? v != *x : true; +} + +template +constexpr bool operator<(const optional& x, const T& v) { + return bool(x) ? *x < v : true; +} + +template +constexpr bool operator>(const T& v, const optional& x) { + return bool(x) ? v > *x : true; +} + +template +constexpr bool operator>(const optional& x, const T& v) { + return bool(x) ? *x > v : false; +} + +template +constexpr bool operator<(const T& v, const optional& x) { + return bool(x) ? v < *x : false; +} + +template +constexpr bool operator>=(const optional& x, const T& v) { + return bool(x) ? *x >= v : false; +} + +template +constexpr bool operator<=(const T& v, const optional& x) { + return bool(x) ? v <= *x : false; +} + +template +constexpr bool operator<=(const optional& x, const T& v) { + return bool(x) ? *x <= v : true; +} + +template +constexpr bool operator>=(const T& v, const optional& x) { + return bool(x) ? v >= *x : true; +} + +// 20.5.12, Specialized algorithms +template +void swap(optional& x, optional& y) noexcept(noexcept(x.swap(y))) { + x.swap(y); +} + +template +constexpr optional::type> make_optional(T&& v) { + return optional::type>(constexpr_forward(v)); +} + +template +constexpr optional make_optional(std::reference_wrapper v) { + return optional(v.get()); +} + +} // namespace at + +namespace std { +template +struct hash> { + typedef typename hash::result_type result_type; + typedef at::optional argument_type; + + constexpr result_type operator()(argument_type const& arg) const { + return arg ? std::hash{}(*arg) : result_type{}; + } +}; + +template +struct hash> { + typedef typename hash::result_type result_type; + typedef at::optional argument_type; + + constexpr result_type operator()(argument_type const& arg) const { + return arg ? 
std::hash{}(*arg) : result_type{}; + } +}; +} // namespace std + +#undef TR2_OPTIONAL_REQUIRES +#undef TR2_OPTIONAL_ASSERTED_EXPRESSION diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h new file mode 100644 index 00000000000000..eed9f677a2ef18 --- /dev/null +++ b/aten/src/ATen/cuda/detail/KernelUtils.h @@ -0,0 +1,20 @@ +#pragma once +// Contents of this file are copied from THCUNN/common.h for the ease of porting +// THCUNN functions into ATen. + +namespace at { namespace cuda { namespace detail { + +// CUDA: grid stride looping +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) + +// Use 1024 threads per block, which requires cuda sm_2x or above +constexpr int CUDA_NUM_THREADS = 1024; + +// CUDA: number of blocks for threads. +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +}}} // namespace at::cuda::detail diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index 085f2723bf0455..7ce3da3c9e051c 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -319,6 +319,20 @@ struct AT_CUDA_API RNNDescriptor } }; +#if CUDNN_VERSION >= 7000 + +struct AT_CUDA_API CTCLossDescriptor + : public Descriptor +{ + void set(cudnnDataType_t datatype) { + AT_CUDNN_CHECK(cudnnSetCTCLossDescriptor(mut_desc(), datatype)); + } +}; + +#endif + union Constant { float f; diff --git a/aten/src/ATen/detail/UniqueVoidPtr.cpp b/aten/src/ATen/detail/UniqueVoidPtr.cpp deleted file mode 100644 index 07531d826367ae..00000000000000 --- a/aten/src/ATen/detail/UniqueVoidPtr.cpp +++ /dev/null @@ -1,7 +0,0 @@ -#include - -namespace at { namespace detail { - -void deleteNothing(void*) {} - -}} // namespace at diff --git a/aten/src/ATen/detail/VariableHooksInterface.h b/aten/src/ATen/detail/VariableHooksInterface.h index 287116490397f3..836dacb97766ec 100644 --- a/aten/src/ATen/detail/VariableHooksInterface.h +++ b/aten/src/ATen/detail/VariableHooksInterface.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { class Context; @@ -25,6 +26,10 @@ struct AT_API VariableHooksInterface { // squelch -Werror=non-virtual-dtor virtual ~VariableHooksInterface() {} + virtual Type& getVariableType(const at::Type& baseType) const { + AT_ERROR("cannot getVariableType without libtorch"); + } + virtual void registerVariableTypeFor(Context*, Backend backend, ScalarType scalar_type) const { // no-op if Variable not available; it'll get handled (if at all) when // libtorch.so gets loaded diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index 93c20d4be032f4..b012de25194361 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -290,7 +290,7 @@ def __init__(self, reason): 'Backend::${DenseBackend}, ScalarType::Long)'), 'THStorage*': CodeTemplate( - 'checked_cast_storage<${Storage}>(' + 'checked_cast_storage(' '&${arg_name},"${arg_name}",${arg_pos}, ' 'Backend::${Backend}, ScalarType::${ScalarName})'), 'THGenerator*': diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 0f2aaffd6eac9d..209cca57c293ff 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -103,10 +103,6 @@ def check_all_files_written(self): TEMPLATE_PATH = options.source_path + "/templates" GENERATOR_DERIVED = CodeTemplate.from_file( TEMPLATE_PATH + "/GeneratorDerived.h") -STORAGE_DERIVED_CPP = CodeTemplate.from_file( - TEMPLATE_PATH + 
"/StorageDerived.cpp") -STORAGE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/StorageDerived.h") - TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.cpp") SPARSE_TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/SparseTypeDerived.cpp") TYPE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.h") @@ -237,7 +233,6 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations env['isFloatingType'] = is_floating_type env['isIntegralType'] = not is_floating_type if density == 'Dense': - env['Storage'] = "{}{}Storage".format(backend, scalar_name) env['Tensor'] = "{}{}{}Tensor".format(density_tag, backend, scalar_name) env['Type'] = "{}{}{}Type".format(density_tag, backend, scalar_name) env['DenseTensor'] = "{}{}Tensor".format(backend, scalar_name) @@ -246,7 +241,6 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations env['storage_tensor_headers'] = [] if density != 'Sparse': env['storage_tensor_headers'] = [ - '#include "ATen/{}.h"'.format(env['Storage']), '#include "ATen/{}.h"'.format(env['Tensor']), '#include "ATen/{}ByteTensor.h"'.format(env['Backend']), '#include "ATen/{}IntTensor.h"'.format(env['Backend']), @@ -322,8 +316,6 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations if density != 'Sparse': # there are no storage or tensor types for sparse; it's all uniform - fm.write(env['Storage'] + ".cpp", STORAGE_DERIVED_CPP, env) - fm.write(env['Storage'] + ".h", STORAGE_DERIVED_H, env) env['TensorDenseOrSparse'] = TENSOR_DENSE_CPP.substitute(env) fm.write(env['Tensor'] + ".cpp", TENSOR_DERIVED_CPP, env) fm.write(env['Tensor'] + ".h", TENSOR_DERIVED_H, env) @@ -379,7 +371,7 @@ def declare_outputs(): for backend, density, scalar_types in iterate_types(): scalar_name = scalar_types[0] full_backend = "Sparse" + backend if density == "Sparse" else backend - for kind in ["Storage", "Type", "Tensor"]: + for kind in ["Type", "Tensor"]: if kind != 'Type' and density == "Sparse": # No Storage or Tensor for sparse continue diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index a3dc735ab1e4cb..36f1e4c0bf86de 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -25,6 +25,16 @@ Tensor & selu_(Tensor & self) { return at::elu_(self, SELU_ALPHA, SELU_SCALE); } +Tensor celu(const Tensor & self, Scalar alpha) { + double inv_alpha = 1. / alpha.to(); + return at::elu(self, 1.0, alpha, Scalar(inv_alpha)); +} + +Tensor & celu_(Tensor & self, Scalar alpha) { + double inv_alpha = 1. 
/ alpha.to(); + return at::elu_(self, 1.0, alpha, Scalar(inv_alpha)); +} + Tensor rrelu(const Tensor & self, Scalar lower, Scalar upper, bool training, Generator* generator) { return at::rrelu_with_noise(self, self.type().tensor(), lower, upper, training, generator); } diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index a537691f748171..4028e989b87022 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -402,11 +402,11 @@ at::Tensor _convolution_nogroup( bool transposed, IntList output_padding) { ConvParams params; - params.stride = stride; - params.padding = padding; - params.dilation = dilation; + params.stride = stride.vec(); + params.padding = padding.vec(); + params.dilation = dilation.vec(); params.transposed = transposed; - params.output_padding = output_padding; + params.output_padding = output_padding.vec(); params.groups = 1; params.benchmark = false; params.deterministic = false; @@ -474,11 +474,11 @@ std::tuple _convolution_double_backward( auto weight = weight_r; ConvParams params; - params.stride = stride_; - params.padding = padding_; - params.dilation = dilation_; + params.stride = stride_.vec(); + params.padding = padding_.vec(); + params.dilation = dilation_.vec(); params.transposed = transposed_; - params.output_padding = output_padding_; + params.output_padding = output_padding_.vec(); params.groups = groups_; params.benchmark = benchmark; params.deterministic = deterministic; diff --git a/aten/src/ATen/native/Distributions.h b/aten/src/ATen/native/Distributions.h index 7a6e0788531172..c374740a3ce7d1 100644 --- a/aten/src/ATen/native/Distributions.h +++ b/aten/src/ATen/native/Distributions.h @@ -57,6 +57,7 @@ deviceforcuda scalar_t sample_gamma(scalar_t alpha, BaseSampler& st // Boost alpha for higher acceptance probability. 
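+  // (The boost uses the identity: if U ~ Uniform(0, 1) and Y ~ Gamma(alpha + 1),
+  // then Y * U^(1 / alpha) ~ Gamma(alpha).  When alpha == 0 the exponent
+  // 1 / alpha is not finite, so 0 is returned directly: Gamma(alpha) collapses
+  // to a point mass at zero as alpha -> 0.)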
if (alpha < 1.0f) { + if (alpha == 0.f) return 0.f; scale *= std::pow(1 - standard_uniform.sample(), 1.0f / alpha); alpha += 1.0f; } diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 7599386ee74172..0026a9907d7eca 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -24,7 +24,7 @@ Tensor embedding(const Tensor & weight, const Tensor & indices, return weight.index_select(0, indices); } - auto size = std::vector(indices.sizes()); + auto size = indices.sizes().vec(); for (auto d : weight.sizes().slice(1)) { size.push_back(d); } diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp new file mode 100644 index 00000000000000..5f1c8255772dcf --- /dev/null +++ b/aten/src/ATen/native/GridSampler.cpp @@ -0,0 +1,780 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/detail/CUDAHooksInterface.h" +#include "ATen/native/GridSampler.h" + +#ifdef _OPENMP +#include +#endif + +namespace at { namespace native { + +using at::native::detail::GridSamplerInterpolation; +using at::native::detail::GridSamplerPadding; + +namespace { + static inline int64_t clip_coordinates(int64_t in, int64_t clip_limit) { + return std::min(clip_limit - 1, std::max(in, static_cast(0))); + } + + static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H, int64_t W) { + return h >= 0 && h < H && w >= 0 && w < W; + } + + static inline bool within_bounds_3d(int64_t d, int64_t h, int64_t w, int64_t D, int64_t H, int64_t W) { + return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W; + } + + template + static inline void safe_add_2d(scalar_t *data, int64_t h, int64_t w, + int64_t sH, int64_t sW, int64_t H, int64_t W, + scalar_t delta) { + if (within_bounds_2d(h, w, H, W)) { + data[h * sH + w * sW] += delta; + } + } + + template + static inline void safe_add_3d(scalar_t *data, int64_t d, int64_t h, int64_t w, + int64_t sD, int64_t sH, int64_t sW, + int64_t D, int64_t H, int64_t W, + scalar_t delta) { + if (within_bounds_3d(d, h, w, D, H, W)) { + data[d * sD + h * sH + w * sW] += delta; + } + } + + template + Tensor grid_sampler2d_cpu_impl(const Tensor& input, const Tensor& grid, + GridSamplerInterpolation interpolation_mode, + GridSamplerPadding padding_mode) { + int64_t N = input.size(0); + int64_t C = input.size(1); + int64_t inp_H = input.size(2); + int64_t inp_W = input.size(3); + int64_t out_H = grid.size(1); + int64_t out_W = grid.size(2); + auto output = at::empty({N, C, out_H, out_W}, input.options()); + int64_t inp_sN = input.stride(0); + int64_t inp_sC = input.stride(1); + int64_t inp_sH = input.stride(2); + int64_t inp_sW = input.stride(3); + int64_t grid_sN = grid.stride(0); + int64_t grid_sH = grid.stride(1); + int64_t grid_sW = grid.stride(2); + int64_t grid_sCoor = grid.stride(3); + int64_t out_sN = output.stride(0); + int64_t out_sC = output.stride(1); + int64_t out_sH = output.stride(2); + int64_t out_sW = output.stride(3); + scalar_t *inp_ptr = input.data(); + scalar_t *out_ptr = output.data(); + scalar_t *grid_ptr = grid.data(); + // loop over each output pixel + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int64_t n = 0; n < N; ++n) { + scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; + scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; + for (int64_t h = 0; h < out_H; ++h) { + for (int64_t w = 0; w < out_W; ++w) { + // get the corresponding input x, y co-ordinates from grid + scalar_t ix = grid_ptr_N[h * grid_sH + w * grid_sW]; + scalar_t iy = grid_ptr_N[h * 
grid_sH + w * grid_sW + grid_sCoor]; + + // normalize ix, iy from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] + ix = ((ix + 1) / 2) * (inp_W - 1); + iy = ((iy + 1) / 2) * (inp_H - 1); + + // get NE, NW, SE, SW pixel values from (x, y) + int64_t ix_nw = static_cast(std::floor(ix)); + int64_t iy_nw = static_cast(std::floor(iy)); + int64_t ix_ne = ix_nw + 1; + int64_t iy_ne = iy_nw; + int64_t ix_sw = ix_nw; + int64_t iy_sw = iy_nw + 1; + int64_t ix_se = ix_nw + 1; + int64_t iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + scalar_t nw = (ix_se - ix) * (iy_se - iy); + scalar_t ne = (ix - ix_sw) * (iy_sw - iy); + scalar_t sw = (ix_ne - ix) * (iy - iy_ne); + scalar_t se = (ix - ix_nw) * (iy - iy_nw); + + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + ix_nw = clip_coordinates(ix_nw, inp_W); + iy_nw = clip_coordinates(iy_nw, inp_H); + ix_ne = clip_coordinates(ix_ne, inp_W); + iy_ne = clip_coordinates(iy_ne, inp_H); + ix_sw = clip_coordinates(ix_sw, inp_W); + iy_sw = clip_coordinates(iy_sw, inp_H); + ix_se = clip_coordinates(ix_se, inp_W); + iy_se = clip_coordinates(iy_se, inp_H); + } + + // calculate bilinear weighted pixel value and set output pixel + scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; + scalar_t *inp_ptr_NC = inp_ptr_N; + for (int c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { + // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne + // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se + *out_ptr_NCHW = static_cast(0); + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { + *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { + *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { + *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { + *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; + } + } + } + } + } + return output; + } + + template + Tensor grid_sampler3d_cpu_impl(const Tensor& input, const Tensor& grid, + GridSamplerInterpolation interpolation_mode, + GridSamplerPadding padding_mode) { + int64_t N = input.size(0); + int64_t C = input.size(1); + int64_t inp_D = input.size(2); + int64_t inp_H = input.size(3); + int64_t inp_W = input.size(4); + int64_t out_D = grid.size(1); + int64_t out_H = grid.size(2); + int64_t out_W = grid.size(3); + auto output = at::empty({N, C, out_D, out_H, out_W}, input.options()); + int64_t inp_sN = input.stride(0); + int64_t inp_sC = input.stride(1); + int64_t inp_sD = input.stride(2); + int64_t inp_sH = input.stride(3); + int64_t inp_sW = input.stride(4); + int64_t grid_sN = grid.stride(0); + int64_t grid_sD = grid.stride(1); + int64_t grid_sH = grid.stride(2); + int64_t grid_sW = grid.stride(3); + int64_t grid_sCoor = grid.stride(4); + int64_t out_sN = output.stride(0); + int64_t out_sC = output.stride(1); + int64_t out_sD = output.stride(2); + int64_t out_sH = output.stride(3); + int64_t out_sW = output.stride(4); + scalar_t *inp_ptr = input.data(); + scalar_t *out_ptr = output.data(); + scalar_t *grid_ptr = grid.data(); + // loop over each output pixel + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int64_t n = 0; n < N; ++n) { + scalar_t 
*grid_ptr_N = grid_ptr + n * grid_sN; + scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; + for (int64_t d = 0; d < out_D; ++d) { + for (int64_t h = 0; h < out_H; ++h) { + for (int64_t w = 0; w < out_W; ++w) { + // get the corresponding input x, y, z co-ordinates from grid + scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; + scalar_t ix = *grid_ptr_NDHW; + scalar_t iy = grid_ptr_NDHW[grid_sCoor]; + scalar_t iz = grid_ptr_NDHW[2 * grid_sCoor]; + + // normalize ix, iy, iz from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] & [0, inp_D-1] + ix = ((ix + 1) / 2) * (inp_W - 1); + iy = ((iy + 1) / 2) * (inp_H - 1); + iz = ((iz + 1) / 2) * (inp_D - 1); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int64_t ix_tnw = static_cast(std::floor(ix)); + int64_t iy_tnw = static_cast(std::floor(iy)); + int64_t iz_tnw = static_cast(std::floor(iz)); + + int64_t ix_tne = ix_tnw + 1; + int64_t iy_tne = iy_tnw; + int64_t iz_tne = iz_tnw; + + int64_t ix_tsw = ix_tnw; + int64_t iy_tsw = iy_tnw + 1; + int64_t iz_tsw = iz_tnw; + + int64_t ix_tse = ix_tnw + 1; + int64_t iy_tse = iy_tnw + 1; + int64_t iz_tse = iz_tnw; + + int64_t ix_bnw = ix_tnw; + int64_t iy_bnw = iy_tnw; + int64_t iz_bnw = iz_tnw + 1; + + int64_t ix_bne = ix_tnw + 1; + int64_t iy_bne = iy_tnw; + int64_t iz_bne = iz_tnw + 1; + + int64_t ix_bsw = ix_tnw; + int64_t iy_bsw = iy_tnw + 1; + int64_t iz_bsw = iz_tnw + 1; + + int64_t ix_bse = ix_tnw + 1; + int64_t iy_bse = iy_tnw + 1; + int64_t iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + ix_tnw = clip_coordinates(ix_tnw, inp_W); + iy_tnw = clip_coordinates(iy_tnw, inp_H); + iz_tnw = clip_coordinates(iz_tnw, inp_D); + ix_tne = clip_coordinates(ix_tne, inp_W); + iy_tne = clip_coordinates(iy_tne, inp_H); + iz_tne = clip_coordinates(iz_tne, inp_D); + ix_tsw = clip_coordinates(ix_tsw, inp_W); + iy_tsw = clip_coordinates(iy_tsw, inp_H); + iz_tsw = clip_coordinates(iz_tsw, inp_D); + ix_tse = clip_coordinates(ix_tse, inp_W); + iy_tse = clip_coordinates(iy_tse, inp_H); + iz_tse = clip_coordinates(iz_tse, inp_D); + ix_bnw = clip_coordinates(ix_bnw, inp_W); + iy_bnw = clip_coordinates(iy_bnw, inp_H); + iz_bnw = clip_coordinates(iz_bnw, inp_D); + ix_bne = clip_coordinates(ix_bne, inp_W); + iy_bne = clip_coordinates(iy_bne, inp_H); + iz_bne = clip_coordinates(iz_bne, inp_D); + ix_bsw = clip_coordinates(ix_bsw, inp_W); + iy_bsw = clip_coordinates(iy_bsw, inp_H); + iz_bsw = clip_coordinates(iz_bsw, inp_D); + ix_bse = clip_coordinates(ix_bse, inp_W); + iy_bse = clip_coordinates(iy_bse, inp_H); + iz_bse = clip_coordinates(iz_bse, inp_D); + } + + // calculate bilinear weighted pixel value and set output pixel + scalar_t *out_ptr_NCDHW = out_ptr + n * out_sN + d * out_sD + h * out_sH + w * out_sW; + scalar_t *inp_ptr_NC = inp_ptr_N; + for (int c = 0; c < C; ++c, out_ptr_NCDHW += out_sC, inp_ptr_NC += inp_sC) { + // (c, iz_tnw, 
iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * tne + // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * tse + // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) * bne + // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) * bse + *out_ptr_NCDHW = static_cast(0); + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * tne; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse; + } + } + } + } + } + } + return output; + } + + template + std::tuple + grid_sampler2d_backward_cpu_impl(const Tensor& grad_output, + const Tensor& input, const Tensor& grid, + GridSamplerInterpolation interpolation_mode, + GridSamplerPadding padding_mode) { + auto grad_input = at::zeros_like(input); + auto grad_grid = at::empty_like(grid); + int64_t N = input.size(0); + int64_t C = input.size(1); + int64_t inp_H = input.size(2); + int64_t inp_W = input.size(3); + int64_t out_H = grid.size(1); + int64_t out_W = grid.size(2); + int64_t inp_sN = input.stride(0); + int64_t inp_sC = input.stride(1); + int64_t inp_sH = input.stride(2); + int64_t inp_sW = input.stride(3); + int64_t grid_sN = grid.stride(0); + int64_t grid_sH = grid.stride(1); + int64_t grid_sW = grid.stride(2); + int64_t grid_sCoor = grid.stride(3); + int64_t gOut_sN = grad_output.stride(0); + int64_t gOut_sC = grad_output.stride(1); + int64_t gOut_sH = grad_output.stride(2); + int64_t gOut_sW = grad_output.stride(3); + int64_t gInp_sN = grad_input.stride(0); + int64_t gInp_sC = grad_input.stride(1); + int64_t gInp_sH = grad_input.stride(2); + int64_t gInp_sW = grad_input.stride(3); + int64_t gGrid_sN = grad_grid.stride(0); + int64_t gGrid_sW = grad_grid.stride(2); + scalar_t *inp_ptr = input.data(); + scalar_t *grid_ptr = grid.data(); + scalar_t *gOut_ptr = grad_output.data(); + scalar_t *gInp_ptr = grad_input.data(); + scalar_t *gGrid_ptr = grad_grid.data(); + // loop over each output pixel + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int64_t 
n = 0; n < N; ++n) { + scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; + scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; + scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN; + for (int64_t h = 0; h < out_H; ++h) { + for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) { + // get the corresponding input x, y co-ordinates from grid + scalar_t ix = grid_ptr_N[h * grid_sH + w * grid_sW]; + scalar_t iy = grid_ptr_N[h * grid_sH + w * grid_sW + grid_sCoor]; + + // normalize ix, iy from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] + ix = ((ix + 1) / 2) * (inp_W - 1); + iy = ((iy + 1) / 2) * (inp_H - 1); + + // get NE, NW, SE, SW pixel values from (x, y) + int64_t ix_nw = static_cast(std::floor(ix)); + int64_t iy_nw = static_cast(std::floor(iy)); + int64_t ix_ne = ix_nw + 1; + int64_t iy_ne = iy_nw; + int64_t ix_sw = ix_nw; + int64_t iy_sw = iy_nw + 1; + int64_t ix_se = ix_nw + 1; + int64_t iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + scalar_t nw = (ix_se - ix) * (iy_se - iy); + scalar_t ne = (ix - ix_sw) * (iy_sw - iy); + scalar_t sw = (ix_ne - ix) * (iy - iy_ne); + scalar_t se = (ix - ix_nw) * (iy - iy_nw); + + int64_t ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; + + if (padding_mode == GridSamplerPadding::Border) { + // get clipped NE, NW, SE, SW pixel values from (x, y) + ix_nw_cl = clip_coordinates(ix_nw, inp_W); + iy_nw_cl = clip_coordinates(iy_nw, inp_H); + ix_ne_cl = clip_coordinates(ix_ne, inp_W); + iy_ne_cl = clip_coordinates(iy_ne, inp_H); + ix_sw_cl = clip_coordinates(ix_sw, inp_W); + iy_sw_cl = clip_coordinates(iy_sw, inp_H); + ix_se_cl = clip_coordinates(ix_se, inp_W); + iy_se_cl = clip_coordinates(iy_se, inp_H); + } else { + ix_nw_cl = ix_nw; + iy_nw_cl = iy_nw; + ix_ne_cl = ix_ne; + iy_ne_cl = iy_ne; + ix_sw_cl = ix_sw; + iy_sw_cl = iy_sw; + ix_se_cl = ix_se; + iy_se_cl = iy_se; + } + + scalar_t gix = static_cast(0), giy = static_cast(0); + scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; + scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; + scalar_t *inp_ptr_NC = inp_ptr_N; + // calculate bilinear weighted pixel value and set output pixel + for (int c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { + scalar_t gOut = *gOut_ptr_NCHW; + + // calculate and set grad_input + safe_add_2d(gInp_ptr_NC, iy_nw_cl, ix_nw_cl, gInp_sH, gInp_sW, inp_H, inp_W, nw * gOut); + safe_add_2d(gInp_ptr_NC, iy_ne_cl, ix_ne_cl, gInp_sH, gInp_sW, inp_H, inp_W, ne * gOut); + safe_add_2d(gInp_ptr_NC, iy_sw_cl, ix_sw_cl, gInp_sH, gInp_sW, inp_H, inp_W, sw * gOut); + safe_add_2d(gInp_ptr_NC, iy_se_cl, ix_se_cl, gInp_sH, gInp_sW, inp_H, inp_W, se * gOut); + + // calculate grad_grid + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_nw_cl, ix_nw_cl, inp_H, inp_W)) { + scalar_t nw_val = inp_ptr_NC[iy_nw_cl * inp_sH + ix_nw_cl * inp_sW]; + gix -= nw_val * (iy_se - iy) * gOut; + giy -= nw_val * (ix_se - ix) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_ne_cl, ix_ne_cl, inp_H, inp_W)) { + scalar_t ne_val = inp_ptr_NC[iy_ne_cl * inp_sH + ix_ne_cl * inp_sW]; + gix += ne_val * (iy_sw - iy) * gOut; + giy -= ne_val * (ix - ix_sw) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_sw_cl, ix_sw_cl, inp_H, inp_W)) { + scalar_t sw_val = inp_ptr_NC[iy_sw_cl * inp_sH + ix_sw_cl * inp_sW]; + gix -= sw_val * (iy - iy_ne) * gOut; + giy += sw_val * (ix_ne - ix) * gOut; + } + if (padding_mode != 
GridSamplerPadding::Zeros || within_bounds_2d(iy_se_cl, ix_se_cl, inp_H, inp_W)) { + scalar_t se_val = inp_ptr_NC[iy_se_cl * inp_sH + ix_se_cl * inp_sW]; + gix += se_val * (iy - iy_nw) * gOut; + giy += se_val * (ix - ix_nw) * gOut; + } + } + + // un-normalize grad_grid values back to [-1, 1] constraints + gix = gix * (inp_W - 1) / 2; + giy = giy * (inp_H - 1) / 2; + + // assuming grad_grid is contiguous + gGrid_ptr_NHW[0] = gix; + gGrid_ptr_NHW[1] = giy; + } + } + } + return std::make_tuple(grad_input, grad_grid); + } + + template + std::tuple + grid_sampler3d_backward_cpu_impl(const Tensor& grad_output, + const Tensor& input, const Tensor& grid, + GridSamplerInterpolation interpolation_mode, + GridSamplerPadding padding_mode) { + auto grad_input = at::zeros_like(input); + auto grad_grid = at::empty_like(grid); + int64_t N = input.size(0); + int64_t C = input.size(1); + int64_t inp_D = input.size(2); + int64_t inp_H = input.size(3); + int64_t inp_W = input.size(4); + int64_t out_D = grid.size(1); + int64_t out_H = grid.size(2); + int64_t out_W = grid.size(3); + int64_t inp_sN = input.stride(0); + int64_t inp_sC = input.stride(1); + int64_t inp_sD = input.stride(2); + int64_t inp_sH = input.stride(3); + int64_t inp_sW = input.stride(4); + int64_t grid_sN = grid.stride(0); + int64_t grid_sD = grid.stride(1); + int64_t grid_sH = grid.stride(2); + int64_t grid_sW = grid.stride(3); + int64_t grid_sCoor = grid.stride(4); + int64_t gOut_sN = grad_output.stride(0); + int64_t gOut_sC = grad_output.stride(1); + int64_t gOut_sD = grad_output.stride(2); + int64_t gOut_sH = grad_output.stride(3); + int64_t gOut_sW = grad_output.stride(4); + int64_t gInp_sN = grad_input.stride(0); + int64_t gInp_sC = grad_input.stride(1); + int64_t gInp_sD = grad_input.stride(2); + int64_t gInp_sH = grad_input.stride(3); + int64_t gInp_sW = grad_input.stride(4); + int64_t gGrid_sN = grad_grid.stride(0); + int64_t gGrid_sW = grad_grid.stride(3); + scalar_t *inp_ptr = input.data(); + scalar_t *grid_ptr = grid.data(); + scalar_t *gOut_ptr = grad_output.data(); + scalar_t *gInp_ptr = grad_input.data(); + scalar_t *gGrid_ptr = grad_grid.data(); + // loop over each output pixel + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int64_t n = 0; n < N; ++n) { + scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; + scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; + scalar_t *gGrid_ptr_NDHW = gGrid_ptr + n * gGrid_sN; + for (int64_t d = 0; d < out_D; ++d) { + for (int64_t h = 0; h < out_H; ++h) { + for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NDHW += gGrid_sW /* grad_grid is contiguous */ ) { + // get the corresponding input x, y, z co-ordinates from grid + scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; + scalar_t ix = *grid_ptr_NDHW; + scalar_t iy = grid_ptr_NDHW[grid_sCoor]; + scalar_t iz = grid_ptr_NDHW[2 * grid_sCoor]; + + // normalize ix, iy, iz from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] & [0, inp_D-1] + ix = ((ix + 1) / 2) * (inp_W - 1); + iy = ((iy + 1) / 2) * (inp_H - 1); + iz = ((iz + 1) / 2) * (inp_D - 1); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int64_t ix_tnw = static_cast(std::floor(ix)); + int64_t iy_tnw = static_cast(std::floor(iy)); + int64_t iz_tnw = static_cast(std::floor(iz)); + + int64_t ix_tne = ix_tnw + 1; + int64_t iy_tne = iy_tnw; + int64_t iz_tne = iz_tnw; + + int64_t ix_tsw = ix_tnw; + int64_t iy_tsw = iy_tnw + 1; + int64_t iz_tsw = iz_tnw; + + int64_t ix_tse = ix_tnw + 1; + int64_t iy_tse = 
iy_tnw + 1; + int64_t iz_tse = iz_tnw; + + int64_t ix_bnw = ix_tnw; + int64_t iy_bnw = iy_tnw; + int64_t iz_bnw = iz_tnw + 1; + + int64_t ix_bne = ix_tnw + 1; + int64_t iy_bne = iy_tnw; + int64_t iz_bne = iz_tnw + 1; + + int64_t ix_bsw = ix_tnw; + int64_t iy_bsw = iy_tnw + 1; + int64_t iz_bsw = iz_tnw + 1; + + int64_t ix_bse = ix_tnw + 1; + int64_t iy_bse = iy_tnw + 1; + int64_t iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + int64_t ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; + int64_t ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; + int64_t ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; + int64_t ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; + + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + ix_tnw_cl = clip_coordinates(ix_tnw, inp_W); + iy_tnw_cl = clip_coordinates(iy_tnw, inp_H); + iz_tnw_cl = clip_coordinates(iz_tnw, inp_D); + ix_tne_cl = clip_coordinates(ix_tne, inp_W); + iy_tne_cl = clip_coordinates(iy_tne, inp_H); + iz_tne_cl = clip_coordinates(iz_tne, inp_D); + ix_tsw_cl = clip_coordinates(ix_tsw, inp_W); + iy_tsw_cl = clip_coordinates(iy_tsw, inp_H); + iz_tsw_cl = clip_coordinates(iz_tsw, inp_D); + ix_tse_cl = clip_coordinates(ix_tse, inp_W); + iy_tse_cl = clip_coordinates(iy_tse, inp_H); + iz_tse_cl = clip_coordinates(iz_tse, inp_D); + ix_bnw_cl = clip_coordinates(ix_bnw, inp_W); + iy_bnw_cl = clip_coordinates(iy_bnw, inp_H); + iz_bnw_cl = clip_coordinates(iz_bnw, inp_D); + ix_bne_cl = clip_coordinates(ix_bne, inp_W); + iy_bne_cl = clip_coordinates(iy_bne, inp_H); + iz_bne_cl = clip_coordinates(iz_bne, inp_D); + ix_bsw_cl = clip_coordinates(ix_bsw, inp_W); + iy_bsw_cl = clip_coordinates(iy_bsw, inp_H); + iz_bsw_cl = clip_coordinates(iz_bsw, inp_D); + ix_bse_cl = clip_coordinates(ix_bse, inp_W); + iy_bse_cl = clip_coordinates(iy_bse, inp_H); + iz_bse_cl = clip_coordinates(iz_bse, inp_D); + } else { + ix_tnw_cl = ix_tnw; + iy_tnw_cl = iy_tnw; + iz_tnw_cl = iz_tnw; + ix_tne_cl = ix_tne; + iy_tne_cl = iy_tne; + iz_tne_cl = iz_tne; + ix_tsw_cl = ix_tsw; + iy_tsw_cl = iy_tsw; + iz_tsw_cl = iz_tsw; + ix_tse_cl = ix_tse; + iy_tse_cl = iy_tse; + iz_tse_cl = iz_tse; + ix_bnw_cl = ix_bnw; + iy_bnw_cl = iy_bnw; + iz_bnw_cl = iz_bnw; + ix_bne_cl = ix_bne; + iy_bne_cl = iy_bne; + iz_bne_cl = iz_bne; + ix_bsw_cl = ix_bsw; + iy_bsw_cl = iy_bsw; + iz_bsw_cl = iz_bsw; + ix_bse_cl = ix_bse; + iy_bse_cl = iy_bse; + iz_bse_cl = iz_bse; + } + + scalar_t gix = static_cast(0), giy = static_cast(0), giz = static_cast(0); + scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; + scalar_t *inp_ptr_NC = inp_ptr_N; + // calculate bilinear weighted pixel value and set output pixel + for (int c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { + scalar_t gOut = *gOut_ptr_NCDHW; + + // calculate and set grad_input + safe_add_3d(gInp_ptr_NC, 
iz_tnw_cl, iy_tnw_cl, ix_tnw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut); + safe_add_3d(gInp_ptr_NC, iz_tne_cl, iy_tne_cl, ix_tne_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut); + safe_add_3d(gInp_ptr_NC, iz_tsw_cl, iy_tsw_cl, ix_tsw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut); + safe_add_3d(gInp_ptr_NC, iz_tse_cl, iy_tse_cl, ix_tse_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut); + safe_add_3d(gInp_ptr_NC, iz_bnw_cl, iy_bnw_cl, ix_bnw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut); + safe_add_3d(gInp_ptr_NC, iz_bne_cl, iy_bne_cl, ix_bne_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut); + safe_add_3d(gInp_ptr_NC, iz_bsw_cl, iy_bsw_cl, ix_bsw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut); + safe_add_3d(gInp_ptr_NC, iz_bse_cl, iy_bse_cl, ix_bse_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut); + + // calculate grad_grid + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tnw_cl, iy_tnw_cl, ix_tnw_cl, inp_D, inp_H, inp_W)) { + scalar_t tnw_val = inp_ptr_NC[iz_tnw_cl * inp_sD + iy_tnw_cl * inp_sH + ix_tnw_cl * inp_sW]; + gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gOut; + giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gOut; + giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tne_cl, iy_tne_cl, ix_tne_cl, inp_D, inp_H, inp_W)) { + scalar_t tne_val = inp_ptr_NC[iz_tne_cl * inp_sD + iy_tne_cl * inp_sH + ix_tne_cl * inp_sW]; + gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gOut; + giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gOut; + giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tsw_cl, iy_tsw_cl, ix_tsw_cl, inp_D, inp_H, inp_W)) { + scalar_t tsw_val = inp_ptr_NC[iz_tsw_cl * inp_sD + iy_tsw_cl * inp_sH + ix_tsw_cl * inp_sW]; + gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gOut; + giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gOut; + giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tse_cl, iy_tse_cl, ix_tse_cl, inp_D, inp_H, inp_W)) { + scalar_t tse_val = inp_ptr_NC[iz_tse_cl * inp_sD + iy_tse_cl * inp_sH + ix_tse_cl * inp_sW]; + gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gOut; + giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gOut; + giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bnw_cl, iy_bnw_cl, ix_bnw_cl, inp_D, inp_H, inp_W)) { + scalar_t bnw_val = inp_ptr_NC[iz_bnw_cl * inp_sD + iy_bnw_cl * inp_sH + ix_bnw_cl * inp_sW]; + gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gOut; + giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gOut; + giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bne_cl, iy_bne_cl, ix_bne_cl, inp_D, inp_H, inp_W)) { + scalar_t bne_val = inp_ptr_NC[iz_bne_cl * inp_sD + iy_bne_cl * inp_sH + ix_bne_cl * inp_sW]; + gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gOut; + giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gOut; + giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bsw_cl, iy_bsw_cl, ix_bsw_cl, inp_D, inp_H, inp_W)) { + scalar_t bsw_val = inp_ptr_NC[iz_bsw_cl * inp_sD + iy_bsw_cl * inp_sH + ix_bsw_cl * 
inp_sW]; + gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gOut; + giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gOut; + giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bse_cl, iy_bse_cl, ix_bse_cl, inp_D, inp_H, inp_W)) { + scalar_t bse_val = inp_ptr_NC[iz_bse_cl * inp_sD + iy_bse_cl * inp_sH + ix_bse_cl * inp_sW]; + gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gOut; + giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gOut; + giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gOut; + } + } + + // un-normalize grad_grid values back to [-1, 1] constraints + gix = gix * (inp_W - 1) / 2; + giy = giy * (inp_H - 1) / 2; + giz = giz * (inp_D - 1) / 2; + + // assuming grad_grid is contiguous + gGrid_ptr_NDHW[0] = gix; + gGrid_ptr_NDHW[1] = giy; + gGrid_ptr_NDHW[2] = giz; + } + } + } + } + return std::make_tuple(grad_input, grad_grid); + } +} + +// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. +Tensor grid_sampler_2d_cpu(const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode) { + return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler2d_cpu", [&] { + return grid_sampler2d_cpu_impl( + input, grid, static_cast(interpolation_mode), + static_cast(padding_mode)); + }); +} + +// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. +Tensor grid_sampler_3d_cpu(const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode) { + return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler3d_cpu", [&] { + return grid_sampler3d_cpu_impl( + input, grid, static_cast(interpolation_mode), + static_cast(padding_mode)); + }); +} + +// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. +std::tuple +grid_sampler_2d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode) { + return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_2d_backward_cpu", [&] { + return grid_sampler2d_backward_cpu_impl( + grad_output, input, grid, + static_cast(interpolation_mode), + static_cast(padding_mode)); + }); +} + +// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. 
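// The 2D/3D forward and backward kernels above all reduce to the same
// per-output-pixel computation.  A self-contained sketch of the 2D forward case
// on a plain contiguous C x H x W float array, covering the Zeros and Border
// padding modes that the kernels above special-case.  Helper and parameter
// names here are illustrative, not part of the patch.
#include <algorithm>
#include <cmath>
#include <cstdint>

enum class Padding { Zeros, Border };

inline int64_t clip(int64_t v, int64_t limit) {
  return std::min(limit - 1, std::max(v, static_cast<int64_t>(0)));
}

inline bool in_bounds(int64_t h, int64_t w, int64_t H, int64_t W) {
  return h >= 0 && h < H && w >= 0 && w < W;
}

// Sample every channel of `input` at the normalized grid location (gx, gy),
// both in [-1, 1]; writes one value per channel into `out`.
void sample_pixel_2d(const float* input, int64_t C, int64_t H, int64_t W,
                     float gx, float gy, Padding pad, float* out) {
  // unnormalize from [-1, 1] to [0, W-1] / [0, H-1]
  float ix = (gx + 1.f) / 2.f * (W - 1);
  float iy = (gy + 1.f) / 2.f * (H - 1);

  // the four surrounding integer corners
  int64_t x0 = static_cast<int64_t>(std::floor(ix)), x1 = x0 + 1;
  int64_t y0 = static_cast<int64_t>(std::floor(iy)), y1 = y0 + 1;

  // bilinear weight of each corner = area of the opposite sub-rectangle
  float w00 = (x1 - ix) * (y1 - iy), w01 = (ix - x0) * (y1 - iy);
  float w10 = (x1 - ix) * (iy - y0), w11 = (ix - x0) * (iy - y0);

  if (pad == Padding::Border) {
    x0 = clip(x0, W); x1 = clip(x1, W);
    y0 = clip(y0, H); y1 = clip(y1, H);
  }

  for (int64_t c = 0; c < C; ++c) {
    const float* plane = input + c * H * W;
    float acc = 0.f;
    // in Zeros mode, out-of-range corners simply contribute nothing
    if (in_bounds(y0, x0, H, W)) acc += plane[y0 * W + x0] * w00;
    if (in_bounds(y0, x1, H, W)) acc += plane[y0 * W + x1] * w01;
    if (in_bounds(y1, x0, H, W)) acc += plane[y1 * W + x0] * w10;
    if (in_bounds(y1, x1, H, W)) acc += plane[y1 * W + x1] * w11;
    out[c] = acc;
  }
}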
+std::tuple +grid_sampler_3d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode) { + return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_3d_backward_cpu", [&] { + return grid_sampler3d_backward_cpu_impl( + grad_output, input, grid, + static_cast(interpolation_mode), + static_cast(padding_mode)); + }); +} + +Tensor grid_sampler(const Tensor& input, const Tensor& grid, int64_t padding_mode) { + AT_CHECK( + (input.dim() == 4 || input.dim() == 5) && input.dim() == grid.dim(), + "grid_sampler(): expected 4D or 5D input and grid with same number " + "dimensions, but got input with sizes ", input.sizes(), + " and grid with sizes ", grid.sizes()); + AT_CHECK( + input.size(0) == grid.size(0), + "grid_sampler(): expected grid and input to have same batch size, but got " + "input with sizes ", input.sizes(), " and grid with sizes ", grid.sizes()); + AT_CHECK( + grid.size(-1) == input.dim() - 2, + "grid_sampler(): expected grid to have size ", input.dim() - 2, " in last " + "dimension, but got grid with sizes ", grid.sizes()); + // cudnn does not support inputs larger than 1024 + if (at::native::cudnn_is_acceptable(input) && + static_cast(padding_mode) == GridSamplerPadding::Zeros && + input.dim() == 4 && + input.size(1) <= 1024) { + return cudnn_grid_sampler(input, grid); + } + if (input.dim() == 4) { + return at::grid_sampler_2d(input, grid, 0, padding_mode); + } else { + return at::grid_sampler_3d(input, grid, 0, padding_mode); + } +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/GridSampler.h b/aten/src/ATen/native/GridSampler.h new file mode 100644 index 00000000000000..f39b4e996469fa --- /dev/null +++ b/aten/src/ATen/native/GridSampler.h @@ -0,0 +1,9 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" + +namespace at { namespace native { namespace detail { + + enum class GridSamplerInterpolation {Bilinear, Nearest}; + enum class GridSamplerPadding {Zeros, Border, Reflection}; + +}}} // namespace at::native::detail diff --git a/aten/src/ATen/native/Indexing.cpp b/aten/src/ATen/native/Indexing.cpp index 9720adb4895769..e4eb336cd5f453 100644 --- a/aten/src/ATen/native/Indexing.cpp +++ b/aten/src/ATen/native/Indexing.cpp @@ -69,11 +69,7 @@ static std::vector expandByteTensors(const Tensor & self, TensorList ind } // Replace with nonzeros auto nonzero = index.nonzero(); -#ifndef USE_TH_SIZE_ZERO_DIM - auto special_empty = nonzero.numel() == 0; -#else auto special_empty = false; -#endif for (int64_t j = 0; j < index.dim(); j++) { if (special_empty) { // We can't call select on an empty tensor so we just create an empty @@ -214,26 +210,10 @@ static Tensor computeLinearIndex(const Tensor & src, TensorList indices) { return linearIndex; } -#ifndef USE_TH_SIZE_ZERO_DIM -static bool hasEmptyTensor(TensorList tensors) { - for (auto& tensor : tensors) { - if (tensor.defined() && tensor.numel() == 0) { - return true; - } - } - return false; -} -#endif - static std::tuple makeLinearIndex(Tensor self, TensorList orig) { checkIndexTensorTypes(orig); // first expand ByteTensor (boolean masks) into 1 or more LongTensors auto indices = expandByteTensors(self, orig); -#ifndef USE_TH_SIZE_ZERO_DIM - if (hasEmptyTensor(indices)) { - return std::make_tuple(self, self.type().toScalarType(kLong).tensor()); - } -#endif // next broadcast all index tensors together indices = expand_outplace(indices); // add missing null Tensors so that it matches self.dim() @@ -299,11 +279,11 @@ Tensor & index_copy_(Tensor 
& self, int64_t dim, const Tensor & index, const Ten } // Check that source and destination slices have the same size - auto selfSlicedSizes = std::vector(self.sizes()); + auto selfSlicedSizes = self.sizes().vec(); if (selfSlicedSizes.size() > 0) { selfSlicedSizes.erase(selfSlicedSizes.begin() + dim); } - auto sourceSlicedSizes = std::vector(source.sizes()); + auto sourceSlicedSizes = source.sizes().vec(); if (sourceSlicedSizes.size() > 0) { sourceSlicedSizes.erase(sourceSlicedSizes.begin() + dim); } diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index cb24e71119f9b1..c82bf8ba0ae043 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -1,6 +1,7 @@ #include "ATen/ATen.h" #include "ATen/NativeFunctions.h" #include "ATen/WrapDimUtilsMulti.h" +#include namespace at { namespace native { @@ -136,6 +137,8 @@ Tensor einsum(std::string eqn, TensorList tensors) { } else { in_eqn = eqn; } + // remove spaces for einsum compatibility (#9929) + in_eqn.erase(std::remove_if(in_eqn.begin(), in_eqn.end(), isspace), in_eqn.end()); // next we parse in_eq (the left hand side) by iterating. It is a string of comma separated terms per index int64_t operand = 0; @@ -212,7 +215,7 @@ Tensor einsum(std::string eqn, TensorList tensors) { num_output_dims++; } } - } else { // letter (hopefully) + } else if (! isspace(c)) { // letter (hopefully) AT_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis in the right hand side"); AT_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); int64_t letter_num = c-'a'; diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp new file mode 100644 index 00000000000000..092b7255eb4a0d --- /dev/null +++ b/aten/src/ATen/native/LossCTC.cpp @@ -0,0 +1,365 @@ +// Copyright (c) 2018 MathInf GmbH, Thomas Viehmann +// Licensed under the BSD-3-Clause license +// This is the CPU implementation of the Connectionist Temporal Loss. +// We mostly follow Graves. +// 1. Graves et al: http://www.cs.toronto.edu/~graves/icml_2006.pdf +// We use the equations from above link, but note that [1] has 1-based indexing and we (of course) use 0-based. +// Graves et al call the probabilities y, we use log_probs (also calling them inputs) + +#include +#include "ATen/Dispatch.h" +#include "ATen/TensorUtils.h" + +#include +#include + +namespace at { +namespace native { + +namespace { + +// this ad-hoc converts from targets (l in [1]) to augmented targets (l' in [1]) note that no bound-checking is done +template +static inline int64_t get_target_prime(target_t* target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { + if (idx % 2 == 0) { + return BLANK; + } else { + return target[offset + stride * (idx / 2)]; + } +} + +// This kernel is a relatively straightforward implementation of the alpha calculation in the forward backward algorithm (section 4.1). +// A (minor) twist is that we are using log-calculations to enhance numerical stability (log_probs and log_alpha). +// The function returns the loss and the alphas, the alphas are kept for the backward step. The wrapper (ctc_loss below) hides +// the alphas from the user by only returning the loss. 
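// A minimal single-sequence sketch of the alpha recursion described above
// (eqs. (6)-(8) in Graves et al.), on plain std::vector data instead of ATen
// tensors.  log_probs is T x num_labels, row-major, already log-softmaxed.
// The augmented-target and skip-connection logic mirrors get_target_prime and
// the t/s loops below; the function itself is illustrative, not the patch's API.
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

double ctc_nll_single(const std::vector<double>& log_probs, int64_t T,
                      int64_t num_labels, const std::vector<int64_t>& target,
                      int64_t BLANK = 0) {
  const double neginf = -std::numeric_limits<double>::infinity();
  const int64_t L = static_cast<int64_t>(target.size());
  const int64_t S = 2 * L + 1;  // augmented target l': blank, t0, blank, t1, ...
  auto lprime = [&](int64_t s) { return s % 2 == 0 ? BLANK : target[s / 2]; };
  auto lp = [&](int64_t t, int64_t c) { return log_probs[t * num_labels + c]; };

  std::vector<double> alpha(T * S, neginf);
  alpha[0 * S + 0] = lp(0, BLANK);
  if (L > 0) alpha[0 * S + 1] = lp(0, lprime(1));

  for (int64_t t = 1; t < T; ++t) {
    for (int64_t s = 0; s < S; ++s) {
      // paths may stay on s, advance from s-1, or skip from s-2 when the
      // skipped symbol differs (no repeated label, not a blank)
      double a1 = alpha[(t - 1) * S + s];
      double a2 = s > 0 ? alpha[(t - 1) * S + s - 1] : neginf;
      double a3 = (s > 1 && lprime(s - 2) != lprime(s))
                      ? alpha[(t - 1) * S + s - 2] : neginf;
      double m = std::max(a1, std::max(a2, a3));
      if (m == neginf) m = 0;  // cannot do neginf - neginf
      alpha[t * S + s] = std::log(std::exp(a1 - m) + std::exp(a2 - m) +
                                  std::exp(a3 - m)) + m + lp(t, lprime(s));
    }
  }
  // eq (8): valid paths end in the last blank or the last label
  double l1 = alpha[(T - 1) * S + (S - 1)];
  double l2 = L > 0 ? alpha[(T - 1) * S + (S - 2)] : neginf;
  double m = std::max(l1, l2);
  if (m == neginf) m = 0;
  return -(std::log(std::exp(l1 - m) + std::exp(l2 - m)) + m);
}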
+template +std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK) { + // log_probs: input_len x batch_size x num_labels + // targets [int64]: batch_size x target_length OR sum(target_lengths) + constexpr scalar_t neginf = -std::numeric_limits::infinity(); + using target_t = typename std::conditional::type; + + CheckedFrom c = "ctc_loss_cpu"; + auto log_probs_arg = TensorArg(log_probs, "log_probs", 1); + auto targets_arg = TensorArg(targets, "targets", 2); + checkScalarType(c, targets_arg, target_scalar_type); + checkDim(c, log_probs_arg, 3); + checkDimRange(c, targets_arg, 1, 3); + + int64_t batch_size = log_probs.size(1); + int64_t num_labels = log_probs.size(2); + AT_CHECK(BLANK < num_labels, "blank must be in label range"); + AT_CHECK((int64_t) input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); + AT_CHECK((int64_t) target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); + + size_t tg_target_stride; + int64_t max_target_length; + std::vector tg_batch_offsets(batch_size); + if (targets.dim() == 1) { // concatenated targets + int64_t pos = 0; + max_target_length = 0; + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets[i] = pos; + pos += target_lengths[i]; + if (max_target_length < target_lengths[i]) + max_target_length = target_lengths[i]; + } + tg_target_stride = targets.stride(0); + checkSize(c, targets_arg, 0, pos); + } + else { // batch x max_target_length + // dim is 2 + int64_t tg_batch_stride = targets.stride(0); + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets[i] = i * tg_batch_stride; + } + tg_target_stride = targets.stride(1); + max_target_length = targets.size(1); + checkSize(c, targets_arg, 0, batch_size); + AT_CHECK(targets.size(1) >= max_target_length, + "Expected tensor to have size at least ", max_target_length, " at dimension 1, but got size ", targets.size(1), " for ", targets_arg, + " (while checking arguments for ", c, ")"); + } + int64_t max_input_length = log_probs.size(0); + for (int64_t b = 0; b < batch_size; b++) { + AT_CHECK(input_lengths[b] <= max_input_length, + "Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", targets.size(0), " for ", targets_arg, + " (while checking arguments for ", c, ")"); + } + + Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); + Tensor neg_log_likelihood = at::empty({batch_size}, log_probs.options()); + + auto lpp = log_probs.permute({1,0,2}); + auto log_probs_a_global = lpp.accessor(); + auto log_alpha_a_global = log_alpha.accessor(); + auto targets_data = targets.data(); + auto neg_log_likelihood_a = neg_log_likelihood.accessor(); + + // alpha calculation for the first row, the three equations for alpha_1 above eq (6) + // first the default + log_alpha.narrow(1, 0, 1).fill_(neginf); + #pragma omp parallel for + for (int64_t b = 0; b < batch_size; b++) { + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + auto log_probs_a = log_probs_a_global[b]; + auto log_alpha_a = log_alpha_a_global[b]; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + // the first two items of alpha_t above eq (6) + log_alpha_a[0][0] = log_probs_a[0][BLANK]; + if (target_length > 0) + log_alpha_a[0][1] = log_probs_a[0][get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 1, BLANK)]; + + // now the loop over the inputs + for (int64_t t=1; t 0) { 
+ la2 = log_alpha_a[t-1][s-1]; + if (la2 > lamax) + lamax = la2; + } else { + la2 = neginf; + } + if ((s > 1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s-2, BLANK) != + current_target_prime)) { + la3 = log_alpha_a[t-1][s-2]; + if (la3 > lamax) + lamax = la3; + } else { + la3 = neginf; + } + if (lamax == neginf) // cannot do neginf-neginf + lamax = 0; + // this is the assignment of eq (6) + log_alpha_a[t][s] = std::log(std::exp(la1-lamax)+std::exp(la2-lamax)+std::exp(la3-lamax))+lamax + log_probs_a[t][current_target_prime]; + } + } + // the likelihood is the the sum of the last two alphas, eq (8), the loss is the negative log likelihood + scalar_t l1 = log_alpha_a[input_length-1][target_length*2]; + scalar_t l2 = log_alpha_a[input_length-1][target_length*2-1]; + scalar_t m = std::max(l1, l2); + m = ((m == neginf) ? 0 : m); + scalar_t log_likelihood = std::log(std::exp(l1-m)+std::exp(l2-m))+m; + neg_log_likelihood_a[b] = -log_likelihood; + } + + return std::make_tuple(neg_log_likelihood, log_alpha); +} + +// This is the backward. It consists of two phases: +// a) computing the beta analogous to the alphas in the forward (backward half of the forward-backward algorithm) (eq (10) and (11)) +// b) collecting the per-activation characters for all s and wrapping the gradient (eq (16), the collection is the sum) +template +Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, + const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { + constexpr scalar_t neginf = -std::numeric_limits::infinity(); + using target_t = typename std::conditional::type; + int64_t max_input_length = log_probs.size(0); + int64_t batch_size = log_probs.size(1); + int64_t num_labels = log_probs.size(2); + Tensor grad = at::full_like(log_probs, neginf); // at this point, this is log of empty sum + + // The admin bits. We don't do much checking and assume that the forward did. 
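// A self-contained sketch of the two target layouts handled by the bookkeeping
// below.  Targets arrive either as one 1-D tensor of all labels concatenated
// (target_lengths gives each item's share) or as a padded 2-D
// batch x max_target_length tensor; either way, item b's labels are read as
// targets_data[batch_offset[b] + stride * idx].  Plain contiguous vectors
// (stride 1 / row length) and hypothetical names stand in for the ATen stride
// bookkeeping.
#include <algorithm>
#include <cstdint>
#include <vector>

struct TargetLayout {
  std::vector<int64_t> batch_offsets;  // where item b's labels start
  int64_t stride;                      // step between consecutive labels of one item
  int64_t max_target_length;
};

TargetLayout concatenated_layout(const std::vector<int64_t>& target_lengths) {
  TargetLayout out{{}, /*stride=*/1, /*max_target_length=*/0};
  int64_t pos = 0;
  for (int64_t len : target_lengths) {
    out.batch_offsets.push_back(pos);  // items are packed back to back
    pos += len;
    out.max_target_length = std::max(out.max_target_length, len);
  }
  return out;
}

TargetLayout padded_layout(int64_t batch_size, int64_t max_target_length) {
  TargetLayout out{{}, /*stride=*/1, max_target_length};
  for (int64_t b = 0; b < batch_size; ++b) {
    out.batch_offsets.push_back(b * max_target_length);  // start of row b
  }
  return out;
}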
+ int64_t tg_target_stride; + int64_t max_target_length; + std::vector tg_batch_offsets(batch_size); + + if (targets.dim() == 1) { // concatenated targets + int64_t pos = 0; + max_target_length = 0; + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets[i] = pos; + pos += target_lengths[i]; + if (max_target_length < target_lengths[i]) + max_target_length = target_lengths[i]; + } + tg_target_stride = targets.stride(0); + } + else { // batch x max_target_length + // dim is 2 + int64_t tg_batch_stride = targets.stride(0); + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets[i] = i * tg_batch_stride; + } + tg_target_stride = targets.stride(1); + max_target_length = targets.size(1); + } + + Tensor log_beta = at::empty_like(log_alpha); // could be optimized to use only 2 rows + auto lpp = log_probs.permute({1,0,2}); + auto log_probs_a_global = lpp.accessor(); + auto log_alpha_a_global = log_alpha.accessor(); + auto log_beta_a_global = log_beta.accessor(); + auto gp = grad.permute({1,0,2}); + auto grad_a_global = gp.accessor(); + auto targets_data = targets.data(); + + #pragma omp parallel for + for (int64_t b = 0; b < batch_size; b++) { + auto log_probs_a = log_probs_a_global[b]; + auto log_alpha_a = log_alpha_a_global[b]; + auto log_beta_a = log_beta_a_global[b]; + auto grad_a = grad_a_global[b]; + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + // the initialization of beta before eq (10) + // here we do the fill for each batch item separately, as the input lengths will differ, so the t in which + // we start varies + if (input_length > 0) { + log_beta.narrow(0, b, 1).narrow(1, input_length-1, 1).fill_(neginf); + log_beta_a[input_length-1][2*target_length] = log_probs_a[input_length-1][BLANK]; + grad_a[input_length-1][BLANK] = log_alpha_a[input_length-1][2*target_length] + log_beta_a[input_length-1][2*target_length]; + + if (target_length > 0) { + auto current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 2*target_length-1, BLANK); + log_beta_a[input_length-1][2*target_length-1] = log_probs_a[input_length-1][current_target_prime]; + + // the first two are a blank and a non-blank, so we know they are different and we don't need to do log+ + grad_a[input_length-1][current_target_prime] = log_alpha_a[input_length-1][2*target_length-1] + log_beta_a[input_length-1][2*target_length-1]; + } + } + + // now loop applying eq (10) / (11) + for (int64_t t=input_length-2; t>=0; t--) { + // this loop over s could be parallel/vectorized and doesn't really need to be descending... + // alternatively, one might consider moving s to the outer loop to cache current_target_prime more (but then it needs to be descending) + // for the cuda implementation, that gave a speed boost. 
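// The s-loop below repeatedly performs a numerically stable "log +=",
// accumulating log_alpha + log_beta into the gradient bucket of the current
// target character (eq. (16)).  A standalone sketch of that primitive,
// assuming -infinity encodes an empty sum; names are illustrative.
#include <algorithm>
#include <cmath>
#include <limits>

// returns log(exp(a) + exp(b)) without overflow/underflow
inline double log_add_exp(double a, double b) {
  const double neginf = -std::numeric_limits<double>::infinity();
  if (a == neginf) return b;  // adding to an empty sum
  if (b == neginf) return a;
  double m = std::max(a, b);
  return std::log(std::exp(a - m) + std::exp(b - m)) + m;
}

// usage: a gradient bucket starts at -infinity (log of an empty sum), then for
// every s that maps to the same character we do
//   grad_bucket = log_add_exp(grad_bucket, log_alpha + log_beta);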
+ for (int64_t s=2*target_length; s>=0; s--) { + scalar_t lb1 = log_beta_a[t+1][s]; + scalar_t lbmax = lb1; + scalar_t lb2, lb3; + auto current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + if (s < 2*target_length) { + lb2 = log_beta_a[t+1][s+1]; + if (lb2 > lbmax) + lbmax = lb2; + } else { + lb2 = neginf; + } + if ((s < 2*target_length-1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s+2, BLANK) != + current_target_prime)) { + lb3 = log_beta_a[t+1][s+2]; + if (lb3 > lbmax) + lbmax = lb3; + } else { + lb3 = neginf; + } + if (lbmax == neginf) + lbmax = 0; + + log_beta_a[t][s] = std::log(std::exp(lb1-lbmax)+std::exp(lb2-lbmax)+std::exp(lb3-lbmax))+lbmax + log_probs_a[t][current_target_prime]; + // one might check whether one can vectorize this better when done after the t-loop... + // now that we have beta, we fill in the sum of alpha*beta in eq (16) + // in contrast to the cuda implementation, we only parallelize over the batch, so we don't have a concurrency + // issue (several s can map to the same target character) + // collected[b, t, target'[s]] "log+=" log_alpha[t, s]+log_beta[t, s] + scalar_t log_alpha_beta = log_alpha_a[t][s] + log_beta_a[t][s]; + scalar_t &lcab = grad_a[t][current_target_prime]; + if (lcab == neginf) { + lcab = log_alpha_beta; + } else { + scalar_t max = std::max(lcab, log_alpha_beta); + lcab = std::log(std::exp(lcab-max)+std::exp(log_alpha_beta-max))+max; + } + } + } + + // now grad has the sum of eq (16) + // now we wrap up the calculation by adding in the remaining items of eq (16) + // this could be a great target for further vectorization. + // grad is the output gradient, nll is the loss. Note that the likelihood -nll is the Z of eq (16) + scalar_t nll = neg_log_likelihood.accessor()[b]; + scalar_t gr = grad_out.accessor()[b]; + for (int64_t t = 0; t < input_length; t++) { // or go for the full thing? 
+ for (int64_t c = 0; c < num_labels; c++) { + scalar_t& res = grad_a[t][c]; + scalar_t lp = log_probs_a[t][c]; + res = std::exp(lp)-std::exp(res + nll - lp) * gr; + } + } + // zero the remainder + if (input_length < max_input_length) { + grad.narrow(0, input_length, max_input_length - input_length).narrow(1, b, 1).zero_(); + } + } + return grad; +} + +} // namespace + +std::tuple ctc_loss_cpu(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK) { + return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss", [&] { + if (targets.type().scalarType() == kLong) { + return ctc_loss_cpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); + } else { + return ctc_loss_cpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); + } + }); +} + +Tensor ctc_loss_backward_cpu(const Tensor& grad, const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, + const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { + return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss_backward", [&] { + if (targets.type().scalarType() == kLong) { + return ctc_loss_backward_cpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); + } else { + return ctc_loss_backward_cpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); + } + }); +} + +// this wrapper function dispatches to the native and cudnn implementations and hides the alpha/grad from the user (by just returning the loss) +// the gradient is implemented for _cudnn_ctc_loss (just in derivatives.yaml) and _ctc_loss and this function has automatic gradients +// it also handles the reduction if desired +Tensor ctc_loss(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK, int64_t reduction) { + auto& ctx = at::globalContext(); + + bool use_cudnn = + detail::getCUDAHooks().compiledWithCuDNN() && + (detail::getCUDAHooks().versionCuDNN() >= 7000) && + ctx.userEnabledCuDNN() && + (BLANK == 0) && (targets.dim()==1) && + (log_probs.type().scalarType() == at::kFloat) && + (targets.type().scalarType() == at::kInt) && + (log_probs.type().backend() == Backend::CUDA); + + if (use_cudnn) { + // we don't know that input_lengths and target_lengths have the same size (they should, but we didn't check yet) + int64_t max_input_length = log_probs.size(0); + for (int64_t b = 0; b < input_lengths.size(); b++) { + use_cudnn &= (input_lengths[b] == max_input_length); + } + for (int64_t b = 0; b < target_lengths.size(); b++) { + use_cudnn &= (target_lengths[b] <= 256); + } + } + + Tensor res; + if (use_cudnn) { + res = std::get<0>(at::_cudnn_ctc_loss(log_probs, targets, input_lengths, target_lengths, BLANK, ctx.deterministicCuDNN())); + } else { + res = std::get<0>(at::_ctc_loss(log_probs, targets, input_lengths, target_lengths, BLANK)); + } + if (reduction == Reduction::ElementwiseMean) { + auto target_lengths_t = at::tensor(target_lengths, res.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(res.type()); + return (res / target_lengths_t).mean(); + } else if (reduction == Reduction::Sum) { + return res.sum(); + } + return res; +} + +} } // at::native diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index d6ebbd4573a70c..b84b9c3f36b3ea 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ 
b/aten/src/ATen/native/TensorFactories.cpp @@ -141,17 +141,9 @@ Tensor& eye_out_cpu(Tensor& result, int64_t n) { } Tensor& eye_out_cpu(Tensor& result, int64_t n, int64_t m) { -#ifndef USE_TH_SIZE_ZERO_DIM - AT_CHECK(n > 0, "n must be greater than 0, got ", n); -#else AT_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); -#endif -#ifndef USE_TH_SIZE_ZERO_DIM - if(m <= 0) { -#else if(m < 0) { -#endif m = n; } diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index f7ced03c5ab6fc..be7e626fa1b748 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -12,6 +12,10 @@ namespace at { namespace native { +std::vector broadcast_tensors(TensorList tensors) { + return expand_outplace(tensors); +} + static void check_cat_no_zero_dim(TensorList tensors) { for(size_t i = 0; i < tensors.size(); ++i) { auto& t = tensors[i]; @@ -78,9 +82,6 @@ Tensor diagonal(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim2_ } else { diag_size = std::max(std::min(self.size(dim1)+offset, self.size(dim2)), 0); } -#ifndef USE_TH_SIZE_ZERO_DIM - AT_CHECK(diag_size > 0, "invalid diagonal offset ", offset); // the diagonal offset was too large in magnitude -#endif // NumPy allows you to specify offsets "off the end"; let's just be careful not to // set a ridiculous storage_offset in that case (technically it shouldn't matter @@ -95,8 +96,8 @@ Tensor diagonal(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim2_ // construct new size and stride: we drop dim1 and dim2 (maximum first for not changing the index of the minumum) // the new ("joint") dimension is appended to the end of the shape / stride to match numpy semantics - auto sizes = std::vector(self.sizes()); - auto strides = std::vector(self.strides()); + auto sizes = self.sizes().vec(); + auto strides = self.strides().vec(); sizes.erase(sizes.begin() + std::max(dim1, dim2)); strides.erase(strides.begin() + std::max(dim1, dim2)); sizes.erase(sizes.begin() + std::min(dim1, dim2)); @@ -157,11 +158,7 @@ Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { if (start != cur_size) { // start being the end is valid, but not a valid dim specification. start = maybe_wrap_dim(start, cur_size); } -#ifndef USE_TH_SIZE_ZERO_DIM - if (length <= 0 || start > cur_size - length) { -#else if (length < 0 || start > cur_size - length) { -#endif AT_ERROR("start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ")."); } return at::slice(self, dim, start, start + length, 1); @@ -246,14 +243,6 @@ static std::vector infer_size(IntList shape, int64_t numel) { AT_CHECK(newsize != 0, "cannot reshape tensor of 0 elements into shape ", shape); res[*infer_dim] = numel / newsize; } -#ifndef USE_TH_SIZE_ZERO_DIM - if (numel == 0) { - // Collapse zero-element shapes into one dimension because TH handles zeros - // in sizes strangely: x.resize_(1, 0) has shape (1,). TODO: remove this - // once we have multi-dimensional empty tensors. 
- return {0}; - } -#endif return res; } @@ -291,8 +280,8 @@ Tensor select(const Tensor& self, int64_t dim, int64_t index) { if (index < 0) { index += size; } - auto sizes = std::vector(self.sizes()); - auto strides = std::vector(self.strides()); + auto sizes = self.sizes().vec(); + auto strides = self.strides().vec(); auto storage_offset = self.storage_offset() + index * strides[dim]; sizes.erase(sizes.begin() + dim); strides.erase(strides.begin() + dim); @@ -303,8 +292,8 @@ Tensor slice(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_ int64_t ndim = self.dim(); AT_CHECK(ndim > 0, "slice() cannot be applied to a 0-dim tensor."); dim = maybe_wrap_dim(dim, ndim); - auto sizes = std::vector(self.sizes()); - auto strides = std::vector(self.strides()); + auto sizes = self.sizes().vec(); + auto strides = self.strides().vec(); if (step <= 0) { // TODO: support negative strides throw std::runtime_error("slice step must be positive"); @@ -327,12 +316,6 @@ Tensor slice(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_ } auto storage_offset = self.storage_offset() + start * strides[dim]; auto len = end - start; -#ifndef USE_TH_SIZE_ZERO_DIM - if (len == 0) { - // TODO: currently we don't have support for 0-sized dims, return size 0 tensor for now - return self.type().tensor(); - } -#endif sizes[dim] = (len + step - 1) / step; // round-up strides[dim] *= step; return self.as_strided(sizes, strides, storage_offset); @@ -424,7 +407,7 @@ static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t di } if (self._indices().numel() == 0 && self._values().numel() == 0) { - std::vector sizes(self.sizes()); + auto sizes = self.sizes().vec(); std::swap(sizes[dim0], sizes[dim1]); return self.sparse_raw_resize_(sizes, self._sparseDims(), self._denseDims()); @@ -439,7 +422,7 @@ static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t di row0.copy_(row1); row1.copy_(tmp); - std::vector sizes(self.sizes()); + auto sizes = self.sizes().vec(); std::swap(sizes[dim0], sizes[dim1]); return self.sparse_raw_resize_(sizes, -1, -1); @@ -458,8 +441,8 @@ Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1) { return sparse_transpose_(self, dim0, dim1); } - std::vector strides(self.strides()); - std::vector sizes(self.sizes()); + auto strides = self.strides().vec(); + auto sizes = self.sizes().vec(); std::swap(strides[dim0], strides[dim1]); std::swap(sizes[dim0], sizes[dim1]); return self.as_strided_(sizes, strides); @@ -478,8 +461,8 @@ Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1) { return sparse_transpose_(self_clone, dim0, dim1); } - std::vector strides(self.strides()); - std::vector sizes(self.sizes()); + auto strides = self.strides().vec(); + auto sizes = self.sizes().vec(); std::swap(strides[dim0], strides[dim1]); std::swap(sizes[dim0], sizes[dim1]); return self.as_strided(sizes, strides); @@ -539,13 +522,8 @@ inferSqueezeGeometry(const Tensor& tensor, int64_t dim) { std::tuple, std::vector > inferUnsqueezeGeometry(const Tensor& tensor, int64_t dim) { -#ifndef USE_TH_SIZE_ZERO_DIM - if (tensor.numel() == 0) { - throw std::runtime_error("cannot unsqueeze empty tensor"); - } -#endif - std::vector sizes(tensor.sizes()); - std::vector strides(tensor.strides()); + auto sizes = tensor.sizes().vec(); + auto strides = tensor.strides().vec(); int64_t new_stride = dim >= tensor.dim() ? 
1 : sizes[dim] * strides[dim]; sizes.insert(sizes.begin() + dim, 1); strides.insert(strides.begin() + dim, new_stride); @@ -563,7 +541,7 @@ Tensor squeeze(const Tensor& self, int64_t dim) { dim = maybe_wrap_dim(dim, dims); if (dims == 0 || self.sizes()[dim] != 1) { - return self.as_strided(self.sizes().vec(), self.strides().vec()); + return self.as_strided(self.sizes(), self.strides()); } auto g = inferSqueezeGeometry(self, dim); return self.as_strided(std::get<0>(g), std::get<1>(g)); @@ -579,7 +557,7 @@ Tensor & squeeze_(Tensor& self, int64_t dim) { dim = maybe_wrap_dim(dim, self.dim()); if (dims == 0 || self.sizes()[dim] != 1) { - return self.as_strided_(self.sizes().vec(), self.strides().vec()); + return self.as_strided_(self.sizes(), self.strides()); } auto g = inferSqueezeGeometry(self, dim); return self.as_strided_(std::get<0>(g), std::get<1>(g)); diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp index 84759874ef5355..0648387b35d5ae 100644 --- a/aten/src/ATen/native/TensorTransformations.cpp +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -13,7 +13,7 @@ Tensor flip_cpu(const Tensor& self, IntList dims) { const int64_t total_dims = self.dim(), flip_dims_size = dims.size(); flip_check_errors(total_dims, flip_dims_size, dims); - auto flip_dims_v = std::vector(dims); + auto flip_dims_v = dims.vec(); wrap_all_dims(flip_dims_v, total_dims); std::sort(flip_dims_v.begin(), flip_dims_v.end()); auto final_indices = std::vector(total_dims); diff --git a/aten/src/ATen/native/TensorTransformations.h b/aten/src/ATen/native/TensorTransformations.h index 2504a2c3f201b8..9b8c7d62b585c6 100644 --- a/aten/src/ATen/native/TensorTransformations.h +++ b/aten/src/ATen/native/TensorTransformations.h @@ -14,7 +14,7 @@ static inline void flip_check_errors(int64_t total_dims, int64_t flip_dims_size, AT_CHECK(flip_dims_size > 0 && flip_dims_size <= total_dims, "flip dims size out of range, got flip dims size=", flip_dims_size); - auto flip_dims_v = std::vector(dims); + auto flip_dims_v = dims.vec(); // check if dims axis within range auto min_max_d = std::minmax_element(flip_dims_v.begin(), flip_dims_v.end()); diff --git a/aten/src/ATen/native/Vision.cpp b/aten/src/ATen/native/Vision.cpp deleted file mode 100644 index 458e9aca23f0fe..00000000000000 --- a/aten/src/ATen/native/Vision.cpp +++ /dev/null @@ -1,28 +0,0 @@ -#include "ATen/ATen.h" -#include "ATen/NativeFunctions.h" -#include "ATen/detail/CUDAHooksInterface.h" - -namespace { - enum GridSamplerMode {GridSamplerModeZeros, GridSamplerModeBorder}; -} - -namespace at { namespace native { - -Tensor grid_sampler(const Tensor& input, const Tensor& grid, int64_t padding_mode) { - // cudnn does not support inputs larger than 1024 - if (at::native::cudnn_is_acceptable(input) && - padding_mode == GridSamplerModeZeros && - input.dim() == 4 && - input.size(1) <= 1024) { - return cudnn_grid_sampler(input, grid); - } - if (input.dim() == 4) { - return thnn_grid_sampler_bilinear2d(input, grid, padding_mode); - } - if (input.dim() == 5) { - return thnn_grid_sampler_bilinear3d(input, grid, padding_mode); - } - AT_ERROR("grid_sampler(): input must be 4d or 5d but got input of shape: ", input.dim()); -} - -}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/GridSampler.cu b/aten/src/ATen/native/cuda/GridSampler.cu new file mode 100644 index 00000000000000..a47865f2023474 --- /dev/null +++ b/aten/src/ATen/native/cuda/GridSampler.cu @@ -0,0 +1,788 @@ +#include "ATen/ATen.h" +#include 
"ATen/native/GridSampler.h" +#include "ATen/cuda/CUDAContext.h" +#include "ATen/cuda/CUDAApplyUtils.cuh" +#include "ATen/cuda/detail/TensorInfo.cuh" +#include "ATen/cuda/detail/IndexUtils.cuh" +#include "ATen/cuda/detail/KernelUtils.h" + +namespace at { namespace native { + +using namespace at::cuda::detail; + +using at::native::detail::GridSamplerInterpolation; +using at::native::detail::GridSamplerPadding; + +namespace { + static __forceinline__ __device__ + int clip_coordinates(int in, int clip_limit) { + return ::min(clip_limit - 1, ::max(in, static_cast(0))); + } + + static __forceinline__ __device__ + bool within_bounds_2d(int h, int w, int H, int W) { + return h >= 0 && h < H && w >= 0 && w < W; + } + + static __forceinline__ __device__ + bool within_bounds_3d(int d, int h, int w, int D, int H, int W) { + return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W; + } + + template + static __forceinline__ __device__ + void safe_add_2d(scalar_t *data, int h, int w, + int sH, int sW, int H, int W, + scalar_t delta) { + if (within_bounds_2d(h, w, H, W)) { + atomicAdd(data + h * sH + w * sW, delta); + } + } + + template + static __forceinline__ __device__ + void safe_add_3d(scalar_t *data, int d, int h, int w, + int sD, int sH, int sW, int D, int H, int W, + scalar_t delta) { + if (within_bounds_3d(d, h, w, D, H, W)) { + atomicAdd(data + d * sD + h * sH + w * sW, delta); + } + } + + template + __launch_bounds__(1024) + __global__ void grid_sampler_2d_kernel( + const int nthreads, + TensorInfo input, + TensorInfo grid, + TensorInfo output, + const GridSamplerPadding padding_mode) { + + int C = input.sizes[1]; + int inp_H = input.sizes[2]; + int inp_W = input.sizes[3]; + int out_H = grid.sizes[1]; + int out_W = grid.sizes[2]; + int inp_sN = input.strides[0]; + int inp_sC = input.strides[1]; + int inp_sH = input.strides[2]; + int inp_sW = input.strides[3]; + int grid_sN = grid.strides[0]; + int grid_sH = grid.strides[1]; + int grid_sW = grid.strides[2]; + int grid_sCoor = grid.strides[3]; + int out_sN = output.strides[0]; + int out_sC = output.strides[1]; + int out_sH = output.strides[2]; + int out_sW = output.strides[3]; + + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % out_W; + const int h = (index / out_W) % out_H; + const int n = index / (out_H * out_W); + const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + // get the corresponding input x, y co-ordinates from grid + scalar_t ix = grid.data[grid_offset]; + scalar_t iy = grid.data[grid_offset + grid_sCoor]; + + // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] + float ixf = ((ix + 1.f) / 2) * (inp_W - 1); + float iyf = ((iy + 1.f) / 2) * (inp_H - 1); + + ix = static_cast(ixf); + iy = static_cast(iyf); + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = static_cast(::floor(ixf)); + int iy_nw = static_cast(::floor(iyf)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + scalar_t nw = (ix_se - ix) * (iy_se - iy); + scalar_t ne = (ix - ix_sw) * (iy_sw - iy); + scalar_t sw = (ix_ne - ix) * (iy - iy_ne); + scalar_t se = (ix - ix_nw) * (iy - iy_nw); + + // calculate bilinear weighted pixel value and set output pixel + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + ix_nw = clip_coordinates(ix_nw, inp_W); + iy_nw = clip_coordinates(iy_nw, inp_H); + ix_ne = clip_coordinates(ix_ne, inp_W); + iy_ne = 
clip_coordinates(iy_ne, inp_H); + ix_sw = clip_coordinates(ix_sw, inp_W); + iy_sw = clip_coordinates(iy_sw, inp_H); + ix_se = clip_coordinates(ix_se, inp_W); + iy_se = clip_coordinates(iy_se, inp_H); + } + + auto inp_ptr_NC = input.data + n * inp_sN; + auto out_ptr_NCHW = output.data + n * out_sN + h * out_sH + w * out_sW; + for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) { + *out_ptr_NCHW = static_cast(0); + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { + *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { + *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { + *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { + *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; + } + } + } + } + + template + __launch_bounds__(1024) + __global__ void grid_sampler_3d_kernel( + const int nthreads, + TensorInfo input, + TensorInfo grid, + TensorInfo output, + const GridSamplerPadding padding_mode) { + + int C = input.sizes[1]; + int inp_D = input.sizes[2]; + int inp_H = input.sizes[3]; + int inp_W = input.sizes[4]; + int out_D = grid.sizes[1]; + int out_H = grid.sizes[2]; + int out_W = grid.sizes[3]; + int inp_sN = input.strides[0]; + int inp_sC = input.strides[1]; + int inp_sD = input.strides[2]; + int inp_sH = input.strides[3]; + int inp_sW = input.strides[4]; + int grid_sN = grid.strides[0]; + int grid_sD = grid.strides[1]; + int grid_sH = grid.strides[2]; + int grid_sW = grid.strides[3]; + int grid_sCoor = grid.strides[4]; + int out_sN = output.strides[0]; + int out_sC = output.strides[1]; + int out_sD = output.strides[2]; + int out_sH = output.strides[3]; + int out_sW = output.strides[4]; + + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % out_W; + const int h = (index / out_W) % out_H; + const int d = (index / (out_H * out_W)) % out_D; + const int n = index / (out_D * out_H * out_W); + const int grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; + + // get the corresponding input x, y, z co-ordinates from grid + scalar_t ix = grid.data[grid_offset]; + scalar_t iy = grid.data[grid_offset + grid_sCoor]; + scalar_t iz = grid.data[grid_offset + 2 * grid_sCoor]; + + // normalize ix, iy, iz from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] & [0, inp_D-1] + float ixf = ((ix + 1.f) / 2) * (inp_W - 1); + float iyf = ((iy + 1.f) / 2) * (inp_H - 1); + float izf = ((iz + 1.f) / 2) * (inp_D - 1); + + ix = static_cast(ixf); + iy = static_cast(iyf); + iz = static_cast(izf); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = static_cast(::floor(ix)); + int iy_tnw = static_cast(::floor(iy)); + int iz_tnw = static_cast(::floor(iz)); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = 
iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + ix_tnw = clip_coordinates(ix_tnw, inp_W); + iy_tnw = clip_coordinates(iy_tnw, inp_H); + iz_tnw = clip_coordinates(iz_tnw, inp_D); + ix_tne = clip_coordinates(ix_tne, inp_W); + iy_tne = clip_coordinates(iy_tne, inp_H); + iz_tne = clip_coordinates(iz_tne, inp_D); + ix_tsw = clip_coordinates(ix_tsw, inp_W); + iy_tsw = clip_coordinates(iy_tsw, inp_H); + iz_tsw = clip_coordinates(iz_tsw, inp_D); + ix_tse = clip_coordinates(ix_tse, inp_W); + iy_tse = clip_coordinates(iy_tse, inp_H); + iz_tse = clip_coordinates(iz_tse, inp_D); + ix_bnw = clip_coordinates(ix_bnw, inp_W); + iy_bnw = clip_coordinates(iy_bnw, inp_H); + iz_bnw = clip_coordinates(iz_bnw, inp_D); + ix_bne = clip_coordinates(ix_bne, inp_W); + iy_bne = clip_coordinates(iy_bne, inp_H); + iz_bne = clip_coordinates(iz_bne, inp_D); + ix_bsw = clip_coordinates(ix_bsw, inp_W); + iy_bsw = clip_coordinates(iy_bsw, inp_H); + iz_bsw = clip_coordinates(iz_bsw, inp_D); + ix_bse = clip_coordinates(ix_bse, inp_W); + iy_bse = clip_coordinates(iy_bse, inp_H); + iz_bse = clip_coordinates(iz_bse, inp_D); + } + + auto inp_ptr_NC = input.data + n * inp_sN; + auto out_ptr_NCDHW = output.data + n * out_sN + d * out_sD + h * out_sH + w * out_sW; + for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { + // (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * tne + // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * tse + // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) * bne + // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) * bse + *out_ptr_NCDHW = static_cast(0); + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * tne; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += 
inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) { + *out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse; + } + } + } + } + + template + __launch_bounds__(1024) + __global__ void grid_sampler_2d_backward_kernel( + const int nthreads, + TensorInfo grad_output, + TensorInfo input, + TensorInfo grid, + TensorInfo grad_input, // initialized to zeros + TensorInfo grad_grid, // initialized to empty + const GridSamplerPadding padding_mode) { + + int C = input.sizes[1]; + int inp_H = input.sizes[2]; + int inp_W = input.sizes[3]; + int out_H = grid.sizes[1]; + int out_W = grid.sizes[2]; + int inp_sN = input.strides[0]; + int inp_sC = input.strides[1]; + int inp_sH = input.strides[2]; + int inp_sW = input.strides[3]; + int grid_sN = grid.strides[0]; + int grid_sH = grid.strides[1]; + int grid_sW = grid.strides[2]; + int grid_sCoor = grid.strides[3]; + int gOut_sN = grad_output.strides[0]; + int gOut_sC = grad_output.strides[1]; + int gOut_sH = grad_output.strides[2]; + int gOut_sW = grad_output.strides[3]; + int gInp_sN = grad_input.strides[0]; + int gInp_sC = grad_input.strides[1]; + int gInp_sH = grad_input.strides[2]; + int gInp_sW = grad_input.strides[3]; + int gGrid_sW = grad_grid.strides[2]; + + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % out_W; + const int h = (index / out_W) % out_H; + const int n = index / (out_H * out_W); + const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + // get the corresponding input x, y co-ordinates from grid + scalar_t ix = grid.data[grid_offset]; + scalar_t iy = grid.data[grid_offset + grid_sCoor]; + + // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] + float ixf = ((ix + 1.f) / 2) * (inp_W - 1); + float iyf = ((iy + 1.f) / 2) * (inp_H - 1); + + ix = static_cast(ixf); + iy = static_cast(iyf); + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = static_cast(::floor(ixf)); + int iy_nw = static_cast(::floor(iyf)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + scalar_t nw = (ix_se - ix) * (iy_se - iy); + scalar_t ne = (ix - ix_sw) * (iy_sw - iy); + scalar_t sw = (ix_ne - ix) * (iy - iy_ne); + scalar_t se = (ix - ix_nw) * (iy - iy_nw); + + int ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; + + // calculate bilinear weighted pixel value and set output pixel + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + ix_nw_cl = clip_coordinates(ix_nw, inp_W); + iy_nw_cl = clip_coordinates(iy_nw, inp_H); + ix_ne_cl = clip_coordinates(ix_ne, inp_W); + iy_ne_cl = clip_coordinates(iy_ne, inp_H); + ix_sw_cl = clip_coordinates(ix_sw, inp_W); + iy_sw_cl = clip_coordinates(iy_sw, inp_H); + ix_se_cl = clip_coordinates(ix_se, inp_W); + iy_se_cl = clip_coordinates(iy_se, inp_H); + } else { + ix_nw_cl = ix_nw; + iy_nw_cl = iy_nw; + ix_ne_cl = ix_ne; + iy_ne_cl = iy_ne; + ix_sw_cl = ix_sw; + iy_sw_cl = iy_sw; + ix_se_cl = ix_se; + iy_se_cl = iy_se; + } + + scalar_t gix = static_cast(0), giy = static_cast(0); + scalar_t *gOut_ptr_NCHW = grad_output.data + 
n * gOut_sN + h * gOut_sH + w * gOut_sW; + scalar_t *gInp_ptr_NC = grad_input.data + n * gInp_sN; + scalar_t *inp_ptr_NC = input.data + n * inp_sN; + for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, gInp_ptr_NC += gInp_sC, gOut_ptr_NCHW += gOut_sC) { + scalar_t gOut = *gOut_ptr_NCHW; + + // calculate and set grad_input + safe_add_2d(gInp_ptr_NC, iy_nw_cl, ix_nw_cl, gInp_sH, gInp_sW, inp_H, inp_W, nw * gOut); + safe_add_2d(gInp_ptr_NC, iy_ne_cl, ix_ne_cl, gInp_sH, gInp_sW, inp_H, inp_W, ne * gOut); + safe_add_2d(gInp_ptr_NC, iy_sw_cl, ix_sw_cl, gInp_sH, gInp_sW, inp_H, inp_W, sw * gOut); + safe_add_2d(gInp_ptr_NC, iy_se_cl, ix_se_cl, gInp_sH, gInp_sW, inp_H, inp_W, se * gOut); + + // calculate grad_grid + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_nw_cl, ix_nw_cl, inp_H, inp_W)) { + scalar_t nw_val = inp_ptr_NC[iy_nw_cl * inp_sH + ix_nw_cl * inp_sW]; + gix -= nw_val * (iy_se - iy) * gOut; + giy -= nw_val * (ix_se - ix) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_ne_cl, ix_ne_cl, inp_H, inp_W)) { + scalar_t ne_val = inp_ptr_NC[iy_ne_cl * inp_sH + ix_ne_cl * inp_sW]; + gix += ne_val * (iy_sw - iy) * gOut; + giy -= ne_val * (ix - ix_sw) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_sw_cl, ix_sw_cl, inp_H, inp_W)) { + scalar_t sw_val = inp_ptr_NC[iy_sw_cl * inp_sH + ix_sw_cl * inp_sW]; + gix -= sw_val * (iy - iy_ne) * gOut; + giy += sw_val * (ix_ne - ix) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_se_cl, ix_se_cl, inp_H, inp_W)) { + scalar_t se_val = inp_ptr_NC[iy_se_cl * inp_sH + ix_se_cl * inp_sW]; + gix += se_val * (iy - iy_nw) * gOut; + giy += se_val * (ix - ix_nw) * gOut; + } + } + + // un-normalize grad_grid values back to [-1, 1] constraints + gix = gix * (inp_W - 1.f) / 2; + giy = giy * (inp_H - 1.f) / 2; + + // assuming grad_grid is contiguous + // thus we can + // 1. use index with gGrid_sW to diectly compute gGrid_ptr_NHW + // 2. 
directly assign to gGrid_ptr_NHW[0], gGrid_ptr_NHW[1] + scalar_t *gGrid_ptr_NHW = grad_grid.data + index * gGrid_sW; + gGrid_ptr_NHW[0] = gix; + gGrid_ptr_NHW[1] = giy; + } + } + + template + __launch_bounds__(1024) + __global__ void grid_sampler_3d_backward_kernel( + const int nthreads, + TensorInfo grad_output, + TensorInfo input, + TensorInfo grid, + TensorInfo grad_input, // initialized to zeros + TensorInfo grad_grid, // initialized to empty + const GridSamplerPadding padding_mode) { + + int C = input.sizes[1]; + int inp_D = input.sizes[2]; + int inp_H = input.sizes[3]; + int inp_W = input.sizes[4]; + int out_D = grid.sizes[1]; + int out_H = grid.sizes[2]; + int out_W = grid.sizes[3]; + int inp_sN = input.strides[0]; + int inp_sC = input.strides[1]; + int inp_sD = input.strides[2]; + int inp_sH = input.strides[3]; + int inp_sW = input.strides[4]; + int grid_sN = grid.strides[0]; + int grid_sD = grid.strides[1]; + int grid_sH = grid.strides[2]; + int grid_sW = grid.strides[3]; + int grid_sCoor = grid.strides[4]; + int gOut_sN = grad_output.strides[0]; + int gOut_sC = grad_output.strides[1]; + int gOut_sD = grad_output.strides[2]; + int gOut_sH = grad_output.strides[3]; + int gOut_sW = grad_output.strides[4]; + int gInp_sN = grad_input.strides[0]; + int gInp_sC = grad_input.strides[1]; + int gInp_sD = grad_input.strides[2]; + int gInp_sH = grad_input.strides[3]; + int gInp_sW = grad_input.strides[4]; + int gGrid_sW = grad_grid.strides[3]; + + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % out_W; + const int h = (index / out_W) % out_H; + const int d = (index / (out_H * out_W)) % out_D; + const int n = index / (out_D * out_H * out_W); + const int grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; + + // get the corresponding input x, y, z co-ordinates from grid + scalar_t ix = grid.data[grid_offset]; + scalar_t iy = grid.data[grid_offset + grid_sCoor]; + scalar_t iz = grid.data[grid_offset + 2 * grid_sCoor]; + + // normalize ix, iy, iz from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] & [0, inp_D-1] + float ixf = ((ix + 1.f) / 2) * (inp_W - 1); + float iyf = ((iy + 1.f) / 2) * (inp_H - 1); + float izf = ((iz + 1.f) / 2) * (inp_D - 1); + + ix = static_cast(ixf); + iy = static_cast(iyf); + iz = static_cast(izf); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = static_cast(::floor(ix)); + int iy_tnw = static_cast(::floor(iy)); + int iz_tnw = static_cast(::floor(iz)); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + 
scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + int ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; + int ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; + int ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; + int ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; + + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + ix_tnw_cl = clip_coordinates(ix_tnw, inp_W); + iy_tnw_cl = clip_coordinates(iy_tnw, inp_H); + iz_tnw_cl = clip_coordinates(iz_tnw, inp_D); + ix_tne_cl = clip_coordinates(ix_tne, inp_W); + iy_tne_cl = clip_coordinates(iy_tne, inp_H); + iz_tne_cl = clip_coordinates(iz_tne, inp_D); + ix_tsw_cl = clip_coordinates(ix_tsw, inp_W); + iy_tsw_cl = clip_coordinates(iy_tsw, inp_H); + iz_tsw_cl = clip_coordinates(iz_tsw, inp_D); + ix_tse_cl = clip_coordinates(ix_tse, inp_W); + iy_tse_cl = clip_coordinates(iy_tse, inp_H); + iz_tse_cl = clip_coordinates(iz_tse, inp_D); + ix_bnw_cl = clip_coordinates(ix_bnw, inp_W); + iy_bnw_cl = clip_coordinates(iy_bnw, inp_H); + iz_bnw_cl = clip_coordinates(iz_bnw, inp_D); + ix_bne_cl = clip_coordinates(ix_bne, inp_W); + iy_bne_cl = clip_coordinates(iy_bne, inp_H); + iz_bne_cl = clip_coordinates(iz_bne, inp_D); + ix_bsw_cl = clip_coordinates(ix_bsw, inp_W); + iy_bsw_cl = clip_coordinates(iy_bsw, inp_H); + iz_bsw_cl = clip_coordinates(iz_bsw, inp_D); + ix_bse_cl = clip_coordinates(ix_bse, inp_W); + iy_bse_cl = clip_coordinates(iy_bse, inp_H); + iz_bse_cl = clip_coordinates(iz_bse, inp_D); + } else { + ix_tnw_cl = ix_tnw; + iy_tnw_cl = iy_tnw; + iz_tnw_cl = iz_tnw; + ix_tne_cl = ix_tne; + iy_tne_cl = iy_tne; + iz_tne_cl = iz_tne; + ix_tsw_cl = ix_tsw; + iy_tsw_cl = iy_tsw; + iz_tsw_cl = iz_tsw; + ix_tse_cl = ix_tse; + iy_tse_cl = iy_tse; + iz_tse_cl = iz_tse; + ix_bnw_cl = ix_bnw; + iy_bnw_cl = iy_bnw; + iz_bnw_cl = iz_bnw; + ix_bne_cl = ix_bne; + iy_bne_cl = iy_bne; + iz_bne_cl = iz_bne; + ix_bsw_cl = ix_bsw; + iy_bsw_cl = iy_bsw; + iz_bsw_cl = iz_bsw; + ix_bse_cl = ix_bse; + iy_bse_cl = iy_bse; + iz_bse_cl = iz_bse; + } + + scalar_t gix = static_cast(0), giy = static_cast(0), giz = static_cast(0); + scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + scalar_t *gInp_ptr_NC = grad_input.data + n * gInp_sN; + scalar_t *inp_ptr_NC = input.data + n * inp_sN; + // calculate bilinear weighted pixel value and set output pixel + for (int c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { + scalar_t gOut = *gOut_ptr_NCDHW; + + // calculate and set grad_input + safe_add_3d(gInp_ptr_NC, iz_tnw_cl, iy_tnw_cl, ix_tnw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut); + safe_add_3d(gInp_ptr_NC, iz_tne_cl, iy_tne_cl, ix_tne_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut); + safe_add_3d(gInp_ptr_NC, iz_tsw_cl, iy_tsw_cl, ix_tsw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut); + safe_add_3d(gInp_ptr_NC, iz_tse_cl, iy_tse_cl, ix_tse_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut); + safe_add_3d(gInp_ptr_NC, iz_bnw_cl, iy_bnw_cl, ix_bnw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut); + safe_add_3d(gInp_ptr_NC, iz_bne_cl, iy_bne_cl, ix_bne_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut); + safe_add_3d(gInp_ptr_NC, iz_bsw_cl, iy_bsw_cl, ix_bsw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut); + safe_add_3d(gInp_ptr_NC, iz_bse_cl, iy_bse_cl, ix_bse_cl, 
gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut); + + // calculate grad_grid + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tnw_cl, iy_tnw_cl, ix_tnw_cl, inp_D, inp_H, inp_W)) { + scalar_t tnw_val = inp_ptr_NC[iz_tnw_cl * inp_sD + iy_tnw_cl * inp_sH + ix_tnw_cl * inp_sW]; + gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gOut; + giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gOut; + giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tne_cl, iy_tne_cl, ix_tne_cl, inp_D, inp_H, inp_W)) { + scalar_t tne_val = inp_ptr_NC[iz_tne_cl * inp_sD + iy_tne_cl * inp_sH + ix_tne_cl * inp_sW]; + gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gOut; + giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gOut; + giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tsw_cl, iy_tsw_cl, ix_tsw_cl, inp_D, inp_H, inp_W)) { + scalar_t tsw_val = inp_ptr_NC[iz_tsw_cl * inp_sD + iy_tsw_cl * inp_sH + ix_tsw_cl * inp_sW]; + gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gOut; + giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gOut; + giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tse_cl, iy_tse_cl, ix_tse_cl, inp_D, inp_H, inp_W)) { + scalar_t tse_val = inp_ptr_NC[iz_tse_cl * inp_sD + iy_tse_cl * inp_sH + ix_tse_cl * inp_sW]; + gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gOut; + giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gOut; + giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bnw_cl, iy_bnw_cl, ix_bnw_cl, inp_D, inp_H, inp_W)) { + scalar_t bnw_val = inp_ptr_NC[iz_bnw_cl * inp_sD + iy_bnw_cl * inp_sH + ix_bnw_cl * inp_sW]; + gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gOut; + giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gOut; + giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bne_cl, iy_bne_cl, ix_bne_cl, inp_D, inp_H, inp_W)) { + scalar_t bne_val = inp_ptr_NC[iz_bne_cl * inp_sD + iy_bne_cl * inp_sH + ix_bne_cl * inp_sW]; + gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gOut; + giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gOut; + giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bsw_cl, iy_bsw_cl, ix_bsw_cl, inp_D, inp_H, inp_W)) { + scalar_t bsw_val = inp_ptr_NC[iz_bsw_cl * inp_sD + iy_bsw_cl * inp_sH + ix_bsw_cl * inp_sW]; + gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gOut; + giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gOut; + giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gOut; + } + if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bse_cl, iy_bse_cl, ix_bse_cl, inp_D, inp_H, inp_W)) { + scalar_t bse_val = inp_ptr_NC[iz_bse_cl * inp_sD + iy_bse_cl * inp_sH + ix_bse_cl * inp_sW]; + gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gOut; + giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gOut; + giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gOut; + } + } + + // un-normalize grad_grid values back to [-1, 1] constraints + gix = gix * (inp_W - 1) / 2; + giy = giy * (inp_H - 1) / 2; + giz = giz * (inp_D - 1) / 2; + + // assuming grad_grid is contiguous + // thus we can + // 1. use index with gGrid_sW to diectly compute gGrid_ptr_NDHW + // 2. 
directly assign to gGrid_ptr_NDHW[0], gGrid_ptr_NDHW[1], gGrid_ptr_NDHW[2] + scalar_t *gGrid_ptr_NDHW = grad_grid.data + index * gGrid_sW; + gGrid_ptr_NDHW[0] = gix; + gGrid_ptr_NDHW[1] = giy; + gGrid_ptr_NDHW[2] = giz; + } + } +} // namespace + +// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. +Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode) { + auto N = input.size(0); + auto H = grid.size(1); + auto W = grid.size(2); + auto output = at::empty({N, input.size(1), H, W}, input.options()); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_2d_cuda", [&] { + int count = static_cast(N * H * W); + grid_sampler_2d_kernel + <<>>( + count, + getTensorInfo(input), + getTensorInfo(grid), + getTensorInfo(output), + static_cast(padding_mode)); + }); + return output; +} + +// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. +Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode) { + auto N = input.size(0); + auto D = grid.size(1); + auto H = grid.size(2); + auto W = grid.size(3); + auto output = at::empty({N, input.size(1), D, H, W}, input.options()); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_2d_cuda", [&] { + int count = static_cast(N * D * H * W); + grid_sampler_3d_kernel + <<>>( + count, + getTensorInfo(input), + getTensorInfo(grid), + getTensorInfo(output), + static_cast(padding_mode)); + }); + return output; +} + +// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. +std::tuple +grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode) { + auto N = input.size(0); + auto H = grid.size(1); + auto W = grid.size(2); + auto grad_input = at::zeros_like(input); + auto grad_grid = at::empty_like(grid); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_2d_backward_cuda", [&] { + int count = static_cast(N * H * W); + grid_sampler_2d_backward_kernel + <<>>( + count, + getTensorInfo(grad_output), + getTensorInfo(input), + getTensorInfo(grid), + getTensorInfo(grad_input), + getTensorInfo(grad_grid), + static_cast(padding_mode)); + }); + return std::make_tuple(grad_input, grad_grid); +} + +// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. 
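Before moving on to the 3-D backward wrapper, here is a minimal CPU sketch of the bilinear sampling that the 2-D forward kernel above performs for one output location: unnormalize the grid coordinate from [-1, 1], take the four neighbouring pixels, and blend them with the nw/ne/sw/se weights, clipping to the border or zeroing out-of-range taps. This is only an illustration (plain single-channel HxW input, made-up function name), not part of the diff.

#include <algorithm>
#include <cmath>
#include <vector>

enum class Padding { Zeros, Border };

float bilinear_sample(const std::vector<float>& img, int H, int W,
                      float gx, float gy, Padding padding) {
  // normalize from [-1, 1] to [0, W-1] / [0, H-1]
  float ix = ((gx + 1.f) / 2.f) * (W - 1);
  float iy = ((gy + 1.f) / 2.f) * (H - 1);

  int ix_nw = (int)std::floor(ix), iy_nw = (int)std::floor(iy);
  int ix_se = ix_nw + 1,           iy_se = iy_nw + 1;

  // bilinear surface weights, same form as nw/ne/sw/se in the kernel
  float nw = (ix_se - ix) * (iy_se - iy);
  float ne = (ix - ix_nw) * (iy_se - iy);
  float sw = (ix_se - ix) * (iy - iy_nw);
  float se = (ix - ix_nw) * (iy - iy_nw);

  auto clip = [](int v, int limit) { return std::min(limit - 1, std::max(v, 0)); };
  auto tap = [&](int y, int x, float w) -> float {
    if (padding == Padding::Border) { y = clip(y, H); x = clip(x, W); }
    else if (y < 0 || y >= H || x < 0 || x >= W) return 0.f;  // Zeros padding skips the tap
    return img[y * W + x] * w;
  };
  return tap(iy_nw, ix_nw, nw) + tap(iy_nw, ix_se, ne) +
         tap(iy_se, ix_nw, sw) + tap(iy_se, ix_se, se);
}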
+std::tuple +grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode) { + auto N = input.size(0); + auto D = grid.size(1); + auto H = grid.size(2); + auto W = grid.size(3); + auto grad_input = at::zeros_like(input); + auto grad_grid = at::empty_like(grid); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_3d_backward_cuda", [&] { + int count = static_cast(N * D * H * W); + grid_sampler_3d_backward_kernel + <<>>( + count, + getTensorInfo(grad_output), + getTensorInfo(input), + getTensorInfo(grid), + getTensorInfo(grad_input), + getTensorInfo(grad_grid), + static_cast(padding_mode)); + }); + return std::make_tuple(grad_input, grad_grid); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index 4b474e0c079e77..12f22fcaf2f216 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -76,6 +76,9 @@ void gpu_nullary_kernel(TensorIterator& iter, const func_t& f) { using arg0_t = typename traits::result_type; int64_t numel = iter.numel(); + if (numel == 0) { + return; + } if (iter.is_trivial_1d()) { auto strides = iter.get_inner_strides(); int stride0 = strides[0]; @@ -105,6 +108,9 @@ void gpu_unary_kernel(TensorIterator& iter, const func_t& f) { using arg1_t = typename traits::arg1_t; int64_t numel = iter.numel(); + if (numel == 0) { + return; + } if (iter.is_cpu_scalar(1)) { auto a = iter.scalar_value(1); iter.remove_operand(1); @@ -152,6 +158,9 @@ void gpu_binary_kernel(TensorIterator& iter, const func_t& f) { using arg2_t = typename traits::arg2_t; int numel = iter.numel(); + if (numel == 0) { + return; + } if (iter.is_cpu_scalar(1)) { auto a = iter.scalar_value(1); iter.remove_operand(1); diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu new file mode 100644 index 00000000000000..70ece3f4440cf7 --- /dev/null +++ b/aten/src/ATen/native/cuda/LossCTC.cu @@ -0,0 +1,625 @@ +// Copyright (c) 2018 MathInf GmbH, Thomas Viehmann +// Licensed under the BSD-3-Clause license +// This is the GPU implementation of the Connectionist Temporal Loss. +// We mostly follow Graves. +// 1. Graves et al: http://www.cs.toronto.edu/~graves/icml_2006.pdf +// We use the equations from above link, but note that [1] has 1-based indexing and we (of course) use 0-based. +// Graves et al call the probabilities y, we use log_probs (also calling them inputs) +// A few optimizations (simmilar to those here, but also some I didn't take) are described in +// 2. Minmin Sun: http://on-demand.gputechconf.com/gtc/2016/presentation/s6383-minmin-sun-speech-recognition.pdf + +#include +#include + +#include +#include "ATen/Dispatch.h" +#include "ATen/cuda/CUDAApplyUtils.cuh" + +#include +#include + +namespace at { +namespace native { + +namespace { + +// this ad-hoc converts from targets (l in [1]) to augmented targets (l' in [1]) note that no bound-checking is done +// __restrict__ impact to be measured, https://devblogs.nvidia.com/cuda-pro-tip-optimize-pointer-aliasing/ +template +__device__ static inline int64_t get_target_prime(const target_t* __restrict__ target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { + if (idx % 2 == 0) { + return BLANK; + } else { + return target[offset + stride * (idx / 2)]; + } +} + +// this kernel is a relatively straightforward implementation of the alpha calculation in the forward backward algorithm (section 4.1). 
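The get_target_prime helper above encodes the augmented label sequence l' from [1]: blanks at every even position, the original labels at the odd positions, so the augmented length is 2*target_length+1. A minimal CPU sketch of that mapping (illustrative names only):

#include <cstdint>
#include <vector>

int64_t target_prime(const std::vector<int64_t>& targets, int64_t idx, int64_t BLANK) {
  return (idx % 2 == 0) ? BLANK : targets[idx / 2];
}

// Example: targets {3, 1, 4} with BLANK = 0 expand to {0, 3, 0, 1, 0, 4, 0}.
std::vector<int64_t> augmented(const std::vector<int64_t>& targets, int64_t BLANK) {
  std::vector<int64_t> lp(2 * targets.size() + 1);
  for (size_t i = 0; i < lp.size(); ++i) {
    lp[i] = target_prime(targets, (int64_t)i, BLANK);
  }
  return lp;
}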
+// A (minor) twist is that we are using log-calculations to enhance numerical stability (log_probs and log_alpha). +// In total it would be more efficient to compute the beta in the same kernel (e.g. cudnn does this). While the beta are not +// needed for the loss itself (just the grad), we can return log_alpha+log_beta (so same space as currently) and the overhead +// is small and the use-case for loss without grad is relatively limited. +// We parallelize by batch and target sequence. Empirically, it is faster to loop over the input (log probs) sequence and do +// target in parallel, even if it means more frequent __syncthreads. +// In contrast to the cuDNN implementation, we allow large target lengths. For this we need that all previous `s` have been +// computed when we start a new block_s. This is why we have our own for loop here. +template +__global__ void ctc_loss_log_alpha_gpu_kernel(scalar_t* __restrict__ log_alpha_data, + const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, + const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, + scalar_t* __restrict__ neg_log_likelihood_data, + int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, + int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride, + const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, + int64_t batch_size, int64_t BLANK) { + + constexpr scalar_t neginf = -INFINITY; + + // bookkeeping + int64_t b = threadIdx.y + blockIdx.y * blockDim.y; + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t lp_batch_offset = b*lp_batch_stride; + int64_t la_batch_offset = b*la_batch_stride; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + if (b >= batch_size) + return; + + // first row (t=0), the three equations for alpha_1 above eq (6) + for (int64_t block_s = 0; block_s < 2*max_target_length+1; block_s += blockDim.x) { + int64_t s = threadIdx.x + block_s; + scalar_t la; + switch (s) { + case 0: + la = log_probs_data[lp_batch_offset + lp_char_stride * BLANK]; + break; + case 1: + if (target_length > 0) { + la = log_probs_data[lp_batch_offset + lp_char_stride * get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 1, BLANK)]; + } + else { + la = neginf; + } + break; + default: + la = neginf; + } + if (s < 2*max_target_length+1) + log_alpha_data[la_batch_offset + /* la_input_stride * 0 */ + la_target_stride * s] = la; + } + + for (int64_t block_s = 0; block_s < 2*max_target_length+1; block_s += blockDim.x) { + int64_t s = threadIdx.x + block_s; + + // These two only depend on s, so we can cache them. + int64_t current_char; // l_s in eq (6) + bool have_three; // flag which of the two cases in eq (6) we have + if (s < 2*target_length+1) { + current_char = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + have_three = ((s > 1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s-2, BLANK) != + current_char)); + } else { + current_char = BLANK; + have_three = false; + } + for (int64_t t=1; t < max_input_length; t++) { + __syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch + if ((t < input_length) && (target_length > 0) && (s < 2*target_length+1)) { + // only for valid t, s. This is equation (6) and (7), la1, la2, la3 are the three summands, + // lamax is the maximum for the logsumexp trick. 
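The recursion below combines three log-space terms with the standard log-sum-exp trick. As a standalone illustration (plain C++ restatement, not the kernel itself), including the guard the kernel uses when every summand is -infinity:

#include <algorithm>
#include <cmath>
#include <limits>

double logsumexp3(double la1, double la2, double la3) {
  const double neginf = -std::numeric_limits<double>::infinity();
  double lamax = std::max({la1, la2, la3});
  // all-neginf case: pretend the max is 0, exp(-inf - 0) == 0 keeps the result at -inf
  if (lamax == neginf) lamax = 0;
  return std::log(std::exp(la1 - lamax) + std::exp(la2 - lamax) +
                  std::exp(la3 - lamax)) + lamax;
}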
+ scalar_t la1 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * s]; + scalar_t lamax = la1; + scalar_t la2, la3; + if (s > 0) { + la2 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * (s-1)]; + if (la2 > lamax) + lamax = la2; + } else { + la2 = neginf; + } + if (have_three) { + la3 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * (s-2)]; + if (la3 > lamax) + lamax = la3; + } else { + la3 = neginf; + } + if (lamax == neginf) // when all are neginf. (then the whole thing is neginf, but we can pretend) + lamax = 0; + + log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] = std::log(std::exp(la1-lamax)+std::exp(la2-lamax)+std::exp(la3-lamax))+lamax + + log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_char]; + } else { + // otherwise we just set to neginf + if (s < 2*max_target_length+1) + log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] = neginf; + } + } + } + __syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch + + // compute the loss (eq (8)) + if (threadIdx.x == 0) { + scalar_t l1 = log_alpha_data[la_batch_offset + la_input_stride * (input_length-1) + la_target_stride * (target_length*2)]; + scalar_t l2 = log_alpha_data[la_batch_offset + la_input_stride * (input_length-1) + la_target_stride * (target_length*2-1)]; + scalar_t m = ((l1 > l2) ? l1 : l2); + m = ((m == neginf) ? 0 : m); + scalar_t log_likelihood = std::log(std::exp(l1-m)+std::exp(l2-m))+m; + neg_log_likelihood_data[b] = -log_likelihood; + } +} + +// The forward computation. Lot's of admin and a call to the alpha kernel. +// Note: we do not check that the labels are in the valid range. As we use +// them for indexing in the kernels, you'll see memory errors when you +// pass corrupt labels. +// We support both a 2-dimensional tensor as targets (one set of targets in each row) and +// a 1-dimensional tensor where all targets are concatenated (and we use target_lengths +// to figure out where they begin). +// We return log_alpha (currently, might change to (log_alpha+log_beta) to be passed to the +// backward. The dispatch function will only return the loss. 
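For the concatenated (1-D) targets case described above, the per-batch bookkeeping the forward template performs amounts to computing a start offset and the maximum target length. A plain C++ sketch with illustrative names (the real code stores the offsets in a CPU tensor that is later moved to CUDA):

#include <cstdint>
#include <vector>

struct TargetLayout {
  std::vector<int64_t> batch_offsets;  // where batch b's labels start (tg_batch_offsets)
  int64_t max_target_length;
};

TargetLayout concat_target_layout(const std::vector<int64_t>& target_lengths) {
  TargetLayout layout{std::vector<int64_t>(target_lengths.size()), 0};
  int64_t pos = 0;
  for (size_t b = 0; b < target_lengths.size(); ++b) {
    layout.batch_offsets[b] = pos;
    pos += target_lengths[b];
    if (target_lengths[b] > layout.max_target_length) {
      layout.max_target_length = target_lengths[b];
    }
  }
  // pos now equals sum(target_lengths), which must match targets.size(0);
  // the template verifies this with checkSize().
  return layout;
}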
+template +std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const Tensor& targets_, IntList input_lengths, IntList target_lengths, int64_t BLANK) { + // log_probs: input_len x batch_size x num_labels + // targets [int64]: batch_size x target_length OR sum(target_lengths) + CheckedFrom c = "ctc_loss_gpu"; + using target_t = typename std::conditional::type; + auto targets = targets_.toType(log_probs.type().toScalarType(target_scalar_type)); // to log_probs cuda if it isn't there already + auto log_probs_arg = TensorArg(log_probs, "log_probs", 1); + auto targets_arg = TensorArg(targets, "targets", 2); + checkAllSameGPU(c, {log_probs_arg, targets_arg}); + + checkScalarType(c, targets_arg, target_scalar_type); + checkDim(c, log_probs_arg, 3); + checkDimRange(c, targets_arg, 1, 3); + + int64_t batch_size = log_probs.size(1); + int64_t num_labels = log_probs.size(2); + AT_CHECK(BLANK < num_labels, "blank must be in label range"); + AT_CHECK(input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); + AT_CHECK(target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); + + int64_t lp_input_stride = log_probs.stride(0); + int64_t lp_char_stride = log_probs.stride(2); + int64_t tg_target_stride; + + int64_t max_target_length; + auto tg_batch_offsets = at::empty({batch_size}, TensorOptions(at::CPU(kLong))); + auto tg_batch_offsets_data = tg_batch_offsets.data(); + if (targets.dim() == 1) { // concatenated targets + int64_t pos = 0; + max_target_length = 0; + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets_data[i] = pos; + pos += target_lengths[i]; + if (max_target_length < target_lengths[i]) + max_target_length = target_lengths[i]; + } + tg_target_stride = targets.stride(0); + checkSize(c, targets_arg, 0, pos); + } + else { // batch x max_target_length + // dim is 2 + int64_t tg_batch_stride = targets.stride(0); + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets_data[i] = i * tg_batch_stride; + } + tg_target_stride = targets.stride(1); + max_target_length = targets.size(1); + checkSize(c, targets_arg, 0, batch_size); + AT_CHECK(targets.size(1) >= max_target_length, + "Expected tensor to have size at least ", max_target_length, " at dimension 1, but got size ", targets.size(1), " for ", targets_arg, + " (while checking arguments for ", c, ")"); + } + int64_t max_input_length = log_probs.size(0); + for (int64_t b = 0; b < batch_size; b++) { + AT_CHECK(input_lengths[b] <= max_input_length, + "Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", targets.size(0), " for ", targets_arg, + " (while checking arguments for ", c, ")"); + } + + auto target_lengths_t = at::tensor(target_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); + auto input_lengths_t = at::tensor(input_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); + tg_batch_offsets = tg_batch_offsets.toType(targets.type().toScalarType(kLong)); + + Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); + Tensor neg_log_likelihood = at::empty({batch_size}, log_probs.options()); + + // Very likely, we could be more clever here, e.g. learning (or genralizing and reusing) from SoftMax.cu... 
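The launch-shape heuristic that follows can be read in isolation: shrink the x-dimension (target positions) to the smallest power of two still covering 2*max_target_length+1, then give the leftover threads to the batch dimension. A host-side sketch with a worked example; the constants mirror the code below, the helper name is made up:

#include <algorithm>
#include <cstdio>

void ctc_block_shape(int max_target_length, int batch_size,
                     int& threads_target, int& threads_batch) {
  constexpr int max_threads = 1024;
  threads_target = max_threads;
  while (threads_target / 2 >= 2 * max_target_length + 1) {
    threads_target /= 2;
  }
  threads_batch = std::min(max_threads / threads_target, batch_size);
}

int main() {
  int tt, tb;
  ctc_block_shape(/*max_target_length=*/30, /*batch_size=*/16, tt, tb);
  // 2*30+1 = 61 -> threads_target = 64, threads_batch = min(1024/64, 16) = 16
  std::printf("threads_target=%d threads_batch=%d\n", tt, tb);
}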
+ constexpr int max_threads = 1024; + int threads_target = max_threads; + while (threads_target / 2 >= 2*max_target_length+1) { + threads_target /= 2; + } + int threads_batch = std::min(max_threads / threads_target, (int) batch_size); + + dim3 block(threads_target, threads_batch); + dim3 grid((2*max_target_length+1 + threads_target-1)/threads_target, (batch_size+threads_batch-1)/threads_batch); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + ctc_loss_log_alpha_gpu_kernel<<>>( + log_alpha.data(), + log_probs.data(), input_lengths_t.data(), log_probs.size(0), + targets.data(), target_lengths_t.data(), max_target_length, + neg_log_likelihood.data(), + log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), + log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), + tg_batch_offsets.data(), tg_target_stride, + batch_size, BLANK); + return std::make_tuple(neg_log_likelihood, log_alpha); +} + +// The second (backward) half of the forward backward algorithm, (10) and (11). This is parallel to the +// alpha kernel above. (As mentioned above, it might make sense do the calculation in the alpha kernel.) +template +__global__ void ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data, + const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, + const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, + int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, + int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride, + const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, + int64_t batch_size, int64_t BLANK) { + constexpr scalar_t neginf = -INFINITY; + + int64_t b = threadIdx.y + blockIdx.y * blockDim.y; + + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t lp_batch_offset = b*lp_batch_stride; + int64_t lb_batch_offset = b*lb_batch_stride; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + if (b >= batch_size) + return; + + // "first" row, the beta initiaization before eq (10) (t=target_length - differes per batch) + for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) { + int64_t s = threadIdx.x + block_s; + scalar_t lb; + if (s == 2*target_length) { + lb = log_probs_data[lp_batch_offset + (input_length-1) * lp_input_stride + lp_char_stride * BLANK]; + } else if ((target_length > 0) && (s == 2*target_length-1)) { + int64_t current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + lb = log_probs_data[lp_batch_offset + (input_length-1) * lp_input_stride + lp_char_stride * current_target_prime]; + } else { + lb = neginf; + } + if (s < 2*max_target_length+1) { + log_beta_data[lb_batch_offset + (input_length-1) * lb_input_stride + lb_target_stride * s] = lb; + } + } + + // go backward in s + for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) { + int64_t s = threadIdx.x + block_s; + int64_t current_target_prime; + bool have_three; + if (s < 2*target_length+1) { + current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + have_three = ((s < 2*target_length-1) && + (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s+2, BLANK) != + current_target_prime)); + } else { + current_target_prime = BLANK; + have_three = false; + } + // now 
go backward in t. Note that we need to skip the last timestep that we did above. + for (int64_t t=max_input_length-2; t>=0; t--) { + __syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch item + if ((t < input_length-1) && (target_length > 0) && (s < 2*target_length+1)) { + scalar_t lb1 = log_beta_data[lb_batch_offset + lb_input_stride * (t+1) + lb_target_stride * s]; + scalar_t lbmax = lb1; + scalar_t lb2, lb3; + + if (s < 2*target_length) { + lb2 = log_beta_data[lb_batch_offset + lb_input_stride * (t+1) + lb_target_stride * (s+1)]; + if (lb2 > lbmax) + lbmax = lb2; + } else { + lb2 = neginf; + } + if (have_three) { + lb3 = log_beta_data[lb_batch_offset + lb_input_stride * (t+1) + lb_target_stride * (s+2)]; + if (lb3 > lbmax) + lbmax = lb3; + } else { + lb3 = neginf; + } + if (lbmax == neginf) + lbmax = 0; + + scalar_t lb = std::log(std::exp(lb1-lbmax)+std::exp(lb2-lbmax)+std::exp(lb3-lbmax))+lbmax + + log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_target_prime]; + + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = lb; + } else if ((s < 2*max_target_length+1) || (t >= input_length)) { + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = neginf; + } + } + } +} + +// This implements the subtrahend of equation (16) for all *nonblank* characters. +// It assumes you have probs in gradient_data when called +// and it modifies gradient_data to be, the gradient. +// In order to facilitate this inplace update, We don't actually do this in logspace. +// (The other variant implemented uses log_space and the differences seem to be +// not so problematic at least with unit normal distributed test activations.) +// Internally this uses atomicAdd because different threads may write to the same +// gradient position. +// This is parallelised over b and s again. +// Note that for us, the Z of eqn (16) is actually constant for all t and it is the +// likelihood - this is why we use the negative log likelihood below. +// We also multiply by the input gradient to keep with standard autograd style. +// I took this trick from [2], for moderate alphabet sizes a log-space +// calculation (with an atomic log add) is similarly in performance, but for large +// alphabets the inplace nature is a considerable advantage. +template +__global__ void ctc_loss_backward_collect_nonblank_gpu_kernel(scalar_t* __restrict__ gradient_data, + const scalar_t* __restrict__ grad_out_data, int64_t grad_out_batch_stride, + const scalar_t* __restrict__ log_alpha_data, const scalar_t* __restrict__ log_beta_data, + const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, + const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, + const scalar_t* __restrict__ neg_log_likelihood_data, + int64_t gr_input_stride, int64_t gr_batch_stride, int64_t gr_char_stride, + int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, + int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride, + int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride, + const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, + int64_t batch_size, int64_t num_labels, int64_t BLANK) { + int64_t b = threadIdx.y + blockIdx.y * blockDim.y; + int64_t s = threadIdx.x + blockIdx.x * blockDim.y; // note, this directly indexes into targets, no targets prime! 
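What this kernel accumulates per (batch, label position) pair is the subtrahend of equation (16), evaluated in log space. A plain C++ sketch of that inner loop (illustrative names; the real kernel uses atomicAdd because repeated labels map to the same gradient slot):

#include <cmath>
#include <vector>

void accumulate_nonblank_grad(std::vector<double>& grad_for_target,    // [T] gradient slots for this label
                              const std::vector<double>& log_alpha_s,  // [T] alpha at augmented index 2*s+1
                              const std::vector<double>& log_beta_s,   // [T] beta at augmented index 2*s+1
                              const std::vector<double>& log_p_target, // [T] log prob of this label per step
                              double neg_log_likelihood,
                              double grad_out) {
  for (size_t t = 0; t < grad_for_target.size(); ++t) {
    grad_for_target[t] -= std::exp(log_alpha_s[t] + log_beta_s[t] +
                                   neg_log_likelihood - log_p_target[t]) * grad_out;
  }
}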
+ + if (b >= batch_size) + return; + + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t gr_batch_offset = b*gr_batch_stride; + int64_t lp_batch_offset = b*lp_batch_stride; + int64_t la_batch_offset = b*la_batch_stride; + int64_t lb_batch_offset = b*lb_batch_stride; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + if (s >= target_length) + return; + + int64_t target = targets_data[tg_batch_offset + s * tg_target_stride]; + scalar_t nll = neg_log_likelihood_data[b]; + scalar_t gr = grad_out_data[b * grad_out_batch_stride]; + + for (int64_t t = 0; t < input_length; t++) { + scalar_t lp = log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * target]; + atomicAdd(&gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * target], + -std::exp(log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * (s*2+1)] + + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * (s*2+1)] + + nll - lp) * gr); + } +} + +// This is the naive implementation of equation (16). It is parallelised in batch and input timestep. +// It appears to be faster than the above method for small batch sizes. +template +__global__ void ctc_loss_backward_collect_gpu_kernel(scalar_t* __restrict__ gradient_data, + const scalar_t* __restrict__ grad_out_data, int64_t grad_out_batch_stride, + const scalar_t* __restrict__ log_alpha_data, const scalar_t* __restrict__ log_beta_data, + const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, + const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, + const scalar_t* __restrict__ neg_log_likelihood_data, + int64_t gr_input_stride, int64_t gr_batch_stride, int64_t gr_char_stride, + int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, + int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride, + int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride, + const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, + int64_t batch_size, int64_t num_labels, int64_t BLANK) { + + constexpr scalar_t neginf = -INFINITY; + int64_t b = threadIdx.y + blockIdx.y * blockDim.y; + int64_t t = threadIdx.x + blockIdx.x * blockDim.x; + + if ((t >= max_input_length) || (b >= batch_size)) + return; + + int64_t input_length = input_lengths[b]; + int64_t target_length = target_lengths[b]; + int64_t gr_batch_offset = b*gr_batch_stride; + int64_t lp_batch_offset = b*lp_batch_stride; + int64_t la_batch_offset = b*la_batch_stride; + int64_t lb_batch_offset = b*lb_batch_stride; + int64_t tg_batch_offset = tg_batch_offsets[b]; + + // collected[b, t, target'[s]] "log+=" log_alpha[t, s]+log_beta[t, s] + for (int s = 0; s < 2*max_target_length+1; s++) { + if ((target_length > 0) && (s < 2*target_length+1)) { + int64_t current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); + scalar_t log_alpha_beta = (log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] + + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s]); + scalar_t& lcab = gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * current_target_prime]; + if (lcab == neginf) { + lcab = log_alpha_beta; + } else { + scalar_t max = ((lcab > log_alpha_beta) ? 
lcab : log_alpha_beta); + lcab = std::log(std::exp(lcab-max)+std::exp(log_alpha_beta-max))+max; + } + } + } + + scalar_t nll = neg_log_likelihood_data[b]; + scalar_t gr = grad_out_data[b * grad_out_batch_stride]; + + for (int64_t c = 0; c < num_labels; c++) { + scalar_t& res = gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * c]; + if (t < input_length) { + scalar_t lp = log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * c]; + res = std::exp(lp)-std::exp(res + nll - lp) * gr; + } + else { + res = 0.; + } + } +} + +// The backward. It essentially computes eq 16 by using the above kernels. +// We don't do a lot of checking as we envision this to be called only when backpropagating through a (well-checked) forward. +template +Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_probs, const Tensor& targets_, IntList input_lengths, IntList target_lengths, + const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { + constexpr scalar_t neginf = -INFINITY; + using target_t = typename std::conditional::type; + auto targets = targets_.toType(log_probs.type().toScalarType(target_scalar_type)); // to cuda if it isn't there already + int64_t batch_size = log_probs.size(1); + int64_t num_labels = log_probs.size(2); + int64_t lp_input_stride = log_probs.stride(0); + int64_t lp_char_stride = log_probs.stride(2); + int64_t tg_target_stride; + + int64_t max_target_length; + auto tg_batch_offsets = at::empty({batch_size}, TensorOptions(at::CPU(kLong))); + auto tg_batch_offsets_data = tg_batch_offsets.data(); + if (targets.dim() == 1) { // concatenated targets + int64_t pos = 0; + max_target_length = 0; + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets_data[i] = pos; + pos += target_lengths[i]; + if (max_target_length < target_lengths[i]) + max_target_length = target_lengths[i]; + } + tg_target_stride = targets.stride(0); + } + else { // batch x max_target_length + // dim is 2 + int64_t tg_batch_stride = targets.stride(0); + for (int64_t i = 0; i < batch_size; i++) { + tg_batch_offsets_data[i] = i * tg_batch_stride; + } + tg_target_stride = targets.stride(1); + max_target_length = targets.size(1); + } + auto target_lengths_t = at::tensor(target_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); + auto input_lengths_t = at::tensor(input_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); + tg_batch_offsets = tg_batch_offsets.toType(targets.type().toScalarType(kLong)); + + Tensor log_beta = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); + Tensor grad = at::full_like(log_probs, neginf); // initialization for log(sum (alpha beta)) + + // As above, there may be better configurations to use. 
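The per-entry gradient that the naive collect kernel above finally writes can be restated as a one-liner: the probability minus the normalized alpha*beta mass, scaled by the incoming gradient. A hedged sketch (illustrative function name, plain doubles):

#include <cmath>

double ctc_grad_entry(double log_prob,        // lp = log_probs[t, b, c]
                      double log_alpha_beta,  // collected log sum of alpha*beta for label c at time t
                      double neg_log_likelihood,
                      double grad_out) {
  // matches res = exp(lp) - exp(res + nll - lp) * gr in the kernel (equation (16))
  return std::exp(log_prob) -
         std::exp(log_alpha_beta + neg_log_likelihood - log_prob) * grad_out;
}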
+ constexpr int max_threads = 1024; + int threads_target = max_threads; + while (threads_target / 2 >= 2*max_target_length+1) { + threads_target /= 2; + } + int threads_batch = std::min(max_threads / threads_target, (int) batch_size); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + { + dim3 block(threads_target, threads_batch); + dim3 grid((2*max_target_length+1 + threads_target-1)/threads_target, (batch_size+threads_batch-1)/threads_batch); + + ctc_loss_backward_log_beta_gpu_kernel<<>> + (log_beta.data(), + log_probs.data(), input_lengths_t.data(), log_probs.size(0), + targets.data(), target_lengths_t.data(), max_target_length, + log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), + log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), + tg_batch_offsets.data(), tg_target_stride, + batch_size, BLANK); + } + + // Very crude heuristic for what is a small problem., based on linearly regressing problem dimensions on + // the (capped) difference of timings. + // Note that for OK problems target length <= input length, so we + // only consider input length. + bool is_large = (2*log_probs.size(0)+(24*batch_size)/10+(2*num_labels)/10) > 450; + if (is_large) { // large alphabet, large batch + // this computes the probs, minuend in (16) + exp_out(grad, log_probs); + // now we compute the subtrahend for the blanks. It is a straightforward reduction because we know that + // blanks are in every other position. + // maybe we should kernelize this, too. + auto grad_blank = grad.narrow(2, BLANK, 1); + grad_blank -= (at::logsumexp(log_alpha.as_strided({batch_size, log_alpha.size(1), max_target_length+1}, + {log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2)*2}) + + log_beta.as_strided({batch_size, log_beta.size(1), max_target_length+1}, + {log_beta.stride(0), log_beta.stride(1), log_beta.stride(2)*2}), + 2, true) + .permute({1, 0, 2}) + .add_(neg_log_likelihood.view({1, batch_size, 1})) + .sub_(log_probs.narrow(2, BLANK, 1)) + .exp_() + ); + // Tor the non-blank characters, we use a kernel to compute the subtrahend. + // Again we might configure block and grid in a better way. + int threads_target = max_threads; + while (threads_target / 2 >= max_target_length) { + threads_target /= 2; + } + int threads_batch = std::min(max_threads / threads_target, (int) batch_size); + dim3 block(threads_target, threads_batch); + dim3 grid((max_target_length + threads_target-1)/threads_target, (batch_size+threads_batch-1)/threads_batch); + ctc_loss_backward_collect_nonblank_gpu_kernel<<>> + (grad.data(), + grad_out.data(), grad_out.stride(0), + log_alpha.data(), log_beta.data(), + log_probs.data(), input_lengths_t.data(), log_probs.size(0), + targets.data(), target_lengths_t.data(), max_target_length, + neg_log_likelihood.data(), + grad.stride(0), grad.stride(1), grad.stride(2), + log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), + log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), + log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), + tg_batch_offsets.data(), tg_target_stride, + batch_size, num_labels, BLANK); + } else { // small problem, use naive algorithm + // Still no block/grid configuration guru... 
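The large-versus-small split above is a linear score on the problem dimensions; restated as a standalone predicate so the branch choice is easy to reason about (coefficients copied from the code above, function name made up):

#include <cstdint>

bool ctc_backward_is_large(int64_t time_steps,  // log_probs.size(0)
                           int64_t batch_size,
                           int64_t num_labels) {
  // "large" -> exp_out + logsumexp for blanks + the nonblank collect kernel;
  // otherwise the naive per-(b, t) collect kernel is used.
  return (2 * time_steps + (24 * batch_size) / 10 + (2 * num_labels) / 10) > 450;
}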
+ int threads_input = max_threads; + while (threads_input / 2 >= log_probs.size(0)) { + threads_input /= 2; + } + threads_batch = std::min(max_threads / threads_input, (int) batch_size); + dim3 block(threads_input, threads_batch); + dim3 grid((log_probs.size(0) + threads_input-1)/threads_input, (batch_size+threads_batch-1)/threads_batch); + + ctc_loss_backward_collect_gpu_kernel<<>> + (grad.data(), + grad_out.data(), grad_out.stride(0), + log_alpha.data(), log_beta.data(), + log_probs.data(), input_lengths_t.data(), log_probs.size(0), + targets.data(), target_lengths_t.data(), max_target_length, + neg_log_likelihood.data(), + grad.stride(0), grad.stride(1), grad.stride(2), + log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), + log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), + log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), + tg_batch_offsets.data(), tg_target_stride, + batch_size, num_labels, BLANK); + } + return grad; +} + +} // namespace + +std::tuple ctc_loss_gpu(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK) { + return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss", [&] { + if (targets.type().scalarType() == kLong) { + return ctc_loss_gpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); + } else { + return ctc_loss_gpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); + } + }); +} + +Tensor ctc_loss_backward_gpu(const Tensor& grad, const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, + const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { + return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss_backward", [&] { + if (targets.type().scalarType() == kLong) { + return ctc_loss_backward_gpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); + } else { + return ctc_loss_backward_gpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); + } + }); +} + +} } // at::native diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index 420733dc558c06..5cde662fba78a6 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -20,17 +20,9 @@ Tensor& eye_out_cuda(Tensor& result, int64_t n) { } Tensor& eye_out_cuda(Tensor& result, int64_t n, int64_t m) { -#ifndef USE_TH_SIZE_ZERO_DIM - AT_CHECK(n > 0, "n must be greater than 0, got ", n); -#else AT_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); -#endif -#ifndef USE_TH_SIZE_ZERO_DIM - if(m <= 0) { -#else if(m < 0) { -#endif m = n; } diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu index 7fa1fe64f28d6f..f97395d6392ca6 100644 --- a/aten/src/ATen/native/cuda/TensorTransformations.cu +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -80,7 +80,7 @@ Tensor flip_cuda(const Tensor& self, IntList dims) { return out_tensor; } - auto flip_dims = std::vector(dims); + auto flip_dims = dims.vec(); wrap_all_dims(flip_dims, total_dims); // use kernel_pointwise_flip_apply2 only when to-flip dim is the 1st or last dim, where collapseDims can reduce the amount of work @@ -99,10 +99,10 @@ Tensor flip_cuda(const Tensor& self, IntList dims) { auto flip_dims_t = at::CPU(kLong).tensorFromBlob(flip_dims.data(), {static_cast(flip_dims.size())}); - auto shape = 
std::vector(in_tensor.sizes()); + auto shape = in_tensor.sizes().vec(); auto shape_t = at::CPU(kLong).tensorFromBlob(shape.data(), {static_cast(shape.size())}); - auto strides = std::vector(in_tensor.strides()); + auto strides = in_tensor.strides().vec(); auto strides_t = at::CPU(kLong).tensorFromBlob(strides.data(), {static_cast(strides.size())}); // stride_contiguous is the stride of non-contiguous tensor after calling contiguous(), diff --git a/aten/src/ATen/native/cudnn/LossCTC.cpp b/aten/src/ATen/native/cudnn/LossCTC.cpp new file mode 100644 index 00000000000000..966aa20e0a128d --- /dev/null +++ b/aten/src/ATen/native/cudnn/LossCTC.cpp @@ -0,0 +1,92 @@ +#include +#include +#include +#include +#if AT_CUDNN_ENABLED() + #include +#endif + + +#if !AT_CUDNN_ENABLED() || (CUDNN_VERSION < 7000) + +namespace at { namespace native { + +// See Note [ATen preprocessor philosophy] + +std::tuple _cudnn_ctc_loss(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK, bool deterministic) { + throw std::runtime_error("cudnn_ctc_loss: ATen not compiled with cuDNN >= 7 support"); +} + +}} + +#else // AT_CUDNN_ENABLED + +#include +#include +#include + +#include + +namespace at { namespace native { + +namespace { + +} // namespace + +std::tuple _cudnn_ctc_loss(const Tensor& log_probs_t, const Tensor& targets_t, IntList input_lengths_, IntList target_lengths_, int64_t BLANK, bool deterministic) { + CheckedFrom c = "cudnn_ctc_loss"; + TensorArg log_probs { log_probs_t, "log_probs", 1 }; + TensorArg targets { targets_t, "targets", 2 }; + checkDim(c, log_probs, 3); + checkScalarType(c, log_probs, kFloat); + checkDim(c, targets, 1); + checkScalarType(c, targets, kInt); + checkContiguous(c, targets); // ? + checkBackend(c, {*log_probs}, Backend::CUDA); + checkBackend(c, {*targets}, Backend::CPU); + int64_t batch_size = log_probs->size(1); + AT_CHECK(input_lengths_.size() == batch_size, "input_lengths needs to have size to match batch_size"); + AT_CHECK(target_lengths_.size() == batch_size, "target_lengths needs to have size to match batch_size"); + + std::vector input_lengths(input_lengths_.begin(), input_lengths_.end()); + std::vector target_lengths(target_lengths_.begin(), target_lengths_.end()); + + setCuDNNStreamToCurrent(); + AT_CHECK(BLANK == 0, "blank must be label 0 for cudnn_ctc_loss"); + // checked in dispatch: + // assert other conditions for cudnnCTCLoss: all label lengths <= 256 + // all input lengths = logprob.size(0) + + auto handle = getCudnnHandle(); + + cudnnCTCLossAlgo_t algo = (deterministic ? CUDNN_CTC_LOSS_ALGO_DETERMINISTIC : CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC); + + Tensor probs = log_probs->softmax(2); + TensorDescriptor probs_desc{probs}; + Tensor grad = at::empty_like(probs); + TensorDescriptor grad_desc{grad}; + + CTCLossDescriptor ctc_loss_desc; + ctc_loss_desc.set(CUDNN_DATA_FLOAT); + + size_t workspace_size; + AT_CUDNN_CHECK(cudnnGetCTCLossWorkspaceSize(handle, probs_desc.desc(), grad_desc.desc(), + targets->data(), target_lengths.data(), input_lengths.data(), + algo, ctc_loss_desc.desc(), &workspace_size)); + + + Tensor workspace = log_probs->type().toScalarType(kByte).tensor(workspace_size); // new way of doing this with empty? 
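[Editor's note] Besides the checks visible earlier in this function, the comment above notes two cuDNN-only restrictions that are enforced at dispatch time: every label sequence is at most 256 symbols and every input length equals log_probs.size(0). A plain-C++ sketch of those extra preconditions (hypothetical helper, not the actual dispatcher code):

#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

static void check_cudnn_ctc_preconditions(const std::vector<int64_t>& input_lengths,
                                          const std::vector<int64_t>& target_lengths,
                                          int64_t time_steps, int64_t blank) {
  // (both length vectors have already been checked against batch_size above)
  if (blank != 0)
    throw std::runtime_error("cuDNN CTC loss requires blank == 0");
  for (std::size_t i = 0; i < input_lengths.size(); ++i) {
    if (input_lengths[i] != time_steps)
      throw std::runtime_error("cuDNN CTC loss requires input_lengths[i] == log_probs.size(0)");
    if (target_lengths[i] > 256)
      throw std::runtime_error("cuDNN CTC loss requires target lengths <= 256");
  }
}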
+ Tensor costs = at::empty({log_probs->size(1)}, log_probs->options()); + + AT_CUDNN_CHECK(cudnnCTCLoss(handle, probs_desc.desc(), probs.data_ptr(), + targets->data(), target_lengths.data(), input_lengths.data(), + costs.data_ptr(), grad_desc.desc(), grad.data_ptr(), algo, + ctc_loss_desc.desc(), workspace.data_ptr(), workspace_size)); + + return std::make_tuple(costs, grad); +} + + +}} // namespace at::native + +#endif diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 63f0d7a29578f9..08e84618e81db3 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -166,7 +166,7 @@ namespace { std::vector descriptors(batch_sizes.size()); size_t i = 0; // To be mutated in the loop - std::vector batch_tensor_size(tensor.sizes()); + auto batch_tensor_size = tensor.sizes().vec(); for (auto batch_size : batch_sizes) { batch_tensor_size[0] = batch_size; // NB: cuDNN RNN API does not support 2d descriptors, so we @@ -994,7 +994,7 @@ std::tuple> _cudnn_rnn_backward( if (output_mask[3]) { dw = at::native::_cudnn_rnn_backward_weight(input, weight, weight_stride0, weight_buf, hx, cx, output, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve); } - return std::tuple{dx, dhx, dcx, dw}; + return std::tuple>{dx, dhx, dcx, dw}; } // TODO: I am not sure if we actually need the 'dropout' and 'train' parameters diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 8692d6165ff72a..2a8941675d6c9f 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -29,6 +29,11 @@ - func: _cast_Half(Tensor self, bool non_blocking=false) -> Tensor variants: function, method +- func: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank, bool deterministic) -> (Tensor, Tensor) + variants: function + dispatch: + CUDA: _cudnn_ctc_loss + - func: _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) -> Tensor variants: function dispatch: @@ -244,6 +249,9 @@ - func: blackman_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor variants: function +- func: broadcast_tensors(TensorList tensors) -> TensorList + variants: function + - func: cat(TensorList tensors, int64_t dim=0) -> Tensor variants: function @@ -504,6 +512,21 @@ - func: cumprod_out(Tensor result, Tensor self, int64_t dim) -> Tensor variants: function +- func: ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank=0, int64_t reduction=Reduction::ElementwiseMean) -> Tensor + variants: function + +- func: _ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank=0) -> (Tensor, Tensor) + variants: function + dispatch: + CPU: ctc_loss_cpu + CUDA: ctc_loss_gpu + +- func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int64_t blank) -> Tensor + variants: function + dispatch: + CPU: ctc_loss_backward_cpu + CUDA: ctc_loss_backward_gpu + - func: det(Tensor self) -> Tensor - func: diagflat(Tensor self, int64_t offset=0) -> Tensor @@ -715,9 +738,45 @@ variants: function deprecated: true +# NOTE [ grid_sampler Native Functions ] +# 
`grid_sampler` does all the shape checking and then dispatches to one of +# `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of which +# has the corresponding backward defined as native functions as well. Therefore, +# in these functions and their backwards, no more shape checking is done. +# +# Additionally, arguments `padding_mode` and `interpolation_mode` are cast to +# enums defined in `native/GridSampler.h`. `cudnn_grid_sampler` doesn't take in +# `interpolation_mode` because it only supports Bilinear interpolation mode. +# +# ssnl: Currently `interpolation_mode` is just a placeholder. It is not really +# used. Everywhere Bilinear is assumed. I will add Nearest soon. - func: grid_sampler(Tensor input, Tensor grid, int64_t padding_mode) -> Tensor variants: function +- func: grid_sampler_2d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> Tensor + variants: function + dispatch: + CPU: grid_sampler_2d_cpu + CUDA: grid_sampler_2d_cuda + +- func: grid_sampler_2d_backward(Tensor grad_output, Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> (Tensor, Tensor) + variants: function + dispatch: + CPU: grid_sampler_2d_backward_cpu + CUDA: grid_sampler_2d_backward_cuda + +- func: grid_sampler_3d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> Tensor + variants: function + dispatch: + CPU: grid_sampler_3d_cpu + CUDA: grid_sampler_3d_cuda + +- func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> (Tensor, Tensor) + variants: function + dispatch: + CPU: grid_sampler_3d_backward_cpu + CUDA: grid_sampler_3d_backward_cuda + - func: hann_window(int64_t window_length, TensorOptions options={}) -> Tensor variants: function @@ -1270,6 +1329,12 @@ - func: selu_(Tensor self) -> Tensor variants: function +- func: celu(Tensor self, Scalar alpha=1.0) -> Tensor + variants: function + +- func: celu_(Tensor self, Scalar alpha=1.0) -> Tensor + variants: function + - func: sigmoid(Tensor self) -> Tensor - func: sigmoid_(Tensor self) -> Tensor diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 0cac9bcb9131fa..7a7e8be5c7ff6a 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -63,7 +63,7 @@ SparseTensor new_sparse(const SparseType& dtype) { AT_ASSERT(!dtype.is_variable()); AT_ASSERT(dtype.is_sparse()); // TODO: Hmm... this const_cast business seems a bit dodgy - return SparseTensor(new SparseTensorImpl(const_cast(&dtype)), /* retain */ false); + return SparseTensor(new SparseTensorImpl(dtype.backend(), dtype.scalarType()), /* retain */ false); } /*** Helper methods ***/ diff --git a/aten/src/ATen/native/sparse/SparseUtils.h b/aten/src/ATen/native/sparse/SparseUtils.h index 226b9084579031..aac948e4940241 100644 --- a/aten/src/ATen/native/sparse/SparseUtils.h +++ b/aten/src/ATen/native/sparse/SparseUtils.h @@ -118,7 +118,7 @@ inline Tensor _new_values_with_size_of(const Tensor& values, int64_t nnz) { // That's the assumption this code makes. 
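[Editor's note] Several hunks in this patch, including the one just below, replace std::vector constructed from sizes()/strides() with the .vec() accessor. A minimal sketch of the pattern, assuming only the ATen calls that already appear in this diff (the function name is illustrative):

#include <ATen/ATen.h>

// sizes() returns a non-owning IntList (ArrayRef<int64_t>); .vec() copies it
// into an owned std::vector<int64_t> that can be mutated before being handed
// to a factory such as at::empty.
static at::Tensor values_resized_along_dim0(const at::Tensor& values, int64_t nnz) {
  auto new_size = values.sizes().vec();  // owned, mutable copy of the shape
  new_size[0] = nnz;                     // only the leading dimension changes
  return at::empty(new_size, values.options());
}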
return values.type().tensor({nnz}); } else { - std::vector size = values.sizes(); + std::vector size = values.sizes().vec(); size[0] = nnz; return values.type().tensor(size); } diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu index 02b190e4901c55..ff4b0e0c57736c 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu @@ -81,7 +81,7 @@ SparseTensor coalesce_sparse_cuda(const SparseTensor& self) { int64_t newNnz = newEnd.first - indicesIter; indices1D.resize_({1, newNnz}); - std::vector newValues_size(values.sizes()); + auto newValues_size = values.sizes().vec(); newValues_size[0] = newNnz; Tensor newValues = at::empty(newValues_size, values.options()); diff --git a/aten/src/ATen/nn.yaml b/aten/src/ATen/nn.yaml index 86783e4f76dcd6..8a8a8a5dbe954b 100644 --- a/aten/src/ATen/nn.yaml +++ b/aten/src/ATen/nn.yaml @@ -58,7 +58,7 @@ # Activation functions -- name: elu(Tensor self, Scalar alpha=1, Scalar scale=1) +- name: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) cname: ELU has_inplace: True scalar_check: @@ -274,11 +274,3 @@ - name: thnn_conv_dilated3d(Tensor self, Tensor weight, IntList[3] kernel_size, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] dilation=1) cname: VolumetricDilatedConvolution buffers: [columns, ones] - -# Vision - -- name: thnn_grid_sampler_bilinear2d(Tensor self, Tensor grid, int64_t padding_mode) - cname: SpatialGridSamplerBilinear - -- name: thnn_grid_sampler_bilinear3d(Tensor self, Tensor grid, int64_t padding_mode) - cname: VolumetricGridSamplerBilinear diff --git a/aten/src/ATen/optional.h b/aten/src/ATen/optional.h index 287ddd8577b340..0a395bae67cda6 100644 --- a/aten/src/ATen/optional.h +++ b/aten/src/ATen/optional.h @@ -1,982 +1 @@ -// Copyright (C) 2011 - 2012 Andrzej Krzemienski. -// -// Use, modification, and distribution is subject to the Boost Software -// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) -// -// The idea and interface is based on Boost.Optional library -// authored by Fernando Luis Cacciola Carballal -// -// From https://github.com/akrzemi1/Optional -// -// ATen: -// - Move to `at` namespace. -// - Remove macro use in line 478 because the nvcc device compiler cannot handle it. - -#pragma once - -# include -# include -# include -# include -# include -# include -# include - -# define TR2_OPTIONAL_REQUIRES(...) 
typename std::enable_if<__VA_ARGS__::value, bool>::type = false - -# if defined __GNUC__ // NOTE: GNUC is also defined for Clang -# if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8) -# define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ -# elif (__GNUC__ > 4) -# define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ -# endif -# -# if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 7) -# define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ -# elif (__GNUC__ > 4) -# define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ -# endif -# -# if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8) && (__GNUC_PATCHLEVEL__ >= 1) -# define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -# elif (__GNUC__ == 4) && (__GNUC_MINOR__ >= 9) -# define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -# elif (__GNUC__ > 4) -# define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -# endif -# endif -# -# if defined __clang_major__ -# if (__clang_major__ == 3 && __clang_minor__ >= 5) -# define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ -# elif (__clang_major__ > 3) -# define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ -# endif -# if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ -# define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ -# elif (__clang_major__ == 3 && __clang_minor__ == 4 && __clang_patchlevel__ >= 2) -# define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ -# endif -# endif -# -# if defined _MSC_VER -# if (_MSC_VER >= 1900) -# define TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ -# endif -# endif - -# if defined __clang__ -# if (__clang_major__ > 2) || (__clang_major__ == 2) && (__clang_minor__ >= 9) -# define OPTIONAL_HAS_THIS_RVALUE_REFS 1 -# else -# define OPTIONAL_HAS_THIS_RVALUE_REFS 0 -# endif -# elif defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -# define OPTIONAL_HAS_THIS_RVALUE_REFS 1 -# elif defined TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ -# define OPTIONAL_HAS_THIS_RVALUE_REFS 1 -# else -# define OPTIONAL_HAS_THIS_RVALUE_REFS 0 -# endif - - -# if defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -# define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 1 -# define OPTIONAL_CONSTEXPR_INIT_LIST constexpr -# else -# define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 0 -# define OPTIONAL_CONSTEXPR_INIT_LIST -# endif - -# if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ && (defined __cplusplus) && (__cplusplus != 201103L) -# define OPTIONAL_HAS_MOVE_ACCESSORS 1 -# else -# define OPTIONAL_HAS_MOVE_ACCESSORS 0 -# endif - -# // In C++11 constexpr implies const, so we need to make non-const members also non-constexpr -# if (defined __cplusplus) && (__cplusplus == 201103L) -# define OPTIONAL_MUTABLE_CONSTEXPR -# else -# define OPTIONAL_MUTABLE_CONSTEXPR constexpr -# endif - -namespace at { - -// 20.5.4, optional for object types -template class optional; - -// 20.5.5, optional for lvalue reference types -template class optional; - - -// workaround: std utility functions aren't constexpr yet -template inline constexpr T&& constexpr_forward(typename std::remove_reference::type& t) noexcept -{ - return static_cast(t); -} - -template inline constexpr T&& constexpr_forward(typename std::remove_reference::type&& t) noexcept -{ - static_assert(!std::is_lvalue_reference::value, "!!"); - return static_cast(t); -} - -template inline constexpr typename std::remove_reference::type&& constexpr_move(T&& t) noexcept -{ - return static_cast::type&&>(t); -} - - -#if defined NDEBUG -# define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) (EXPR) -#else -# define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) ((CHECK) ? 
(EXPR) : ([]{assert(!#CHECK);}(), (EXPR))) -#endif - - -namespace detail_ -{ - -// static_addressof: a constexpr version of addressof -template -struct has_overloaded_addressof -{ - template - constexpr static bool has_overload(...) { return false; } - - template ().operator&()) > - constexpr static bool has_overload(bool) { return true; } - - constexpr static bool value = has_overload(true); -}; - -template )> -constexpr T* static_addressof(T& ref) -{ - return &ref; -} - -template )> -T* static_addressof(T& ref) -{ - return std::addressof(ref); -} - - -// the call to convert(b) has return type A and converts b to type A iff b decltype(b) is implicitly convertible to A -template -constexpr U convert(U v) { return v; } - -} // namespace detail - - -constexpr struct trivial_init_t{} trivial_init{}; - - -// 20.5.6, In-place construction -constexpr struct in_place_t{} in_place{}; - - -// 20.5.7, Disengaged state indicator -struct nullopt_t -{ - struct init{}; - constexpr explicit nullopt_t(init){} -}; -constexpr nullopt_t nullopt{nullopt_t::init()}; - - -// 20.5.8, class bad_optional_access -class bad_optional_access : public std::logic_error { -public: - explicit bad_optional_access(const std::string& what_arg) : logic_error{what_arg} {} - explicit bad_optional_access(const char* what_arg) : logic_error{what_arg} {} -}; - - -template -union storage_t -{ - unsigned char dummy_; - T value_; - - constexpr storage_t( trivial_init_t ) noexcept : dummy_() {}; - - template - constexpr storage_t( Args&&... args ) : value_(constexpr_forward(args)...) {} - - ~storage_t(){} -}; - - -template -union constexpr_storage_t -{ - unsigned char dummy_; - T value_; - - constexpr constexpr_storage_t( trivial_init_t ) noexcept : dummy_() {}; - - template - constexpr constexpr_storage_t( Args&&... args ) : value_(constexpr_forward(args)...) {} - - ~constexpr_storage_t() = default; -}; - - -template -struct optional_base -{ - bool init_; - storage_t storage_; - - constexpr optional_base() noexcept : init_(false), storage_(trivial_init) {}; - - explicit constexpr optional_base(const T& v) : init_(true), storage_(v) {} - - explicit constexpr optional_base(T&& v) : init_(true), storage_(constexpr_move(v)) {} - - template explicit optional_base(in_place_t, Args&&... args) - : init_(true), storage_(constexpr_forward(args)...) {} - - template >)> - explicit optional_base(in_place_t, std::initializer_list il, Args&&... args) - : init_(true), storage_(il, std::forward(args)...) {} - - ~optional_base() { if (init_) storage_.value_.T::~T(); } -}; - - -template -struct constexpr_optional_base -{ - bool init_; - constexpr_storage_t storage_; - - constexpr constexpr_optional_base() noexcept : init_(false), storage_(trivial_init) {}; - - explicit constexpr constexpr_optional_base(const T& v) : init_(true), storage_(v) {} - - explicit constexpr constexpr_optional_base(T&& v) : init_(true), storage_(constexpr_move(v)) {} - - template explicit constexpr constexpr_optional_base(in_place_t, Args&&... args) - : init_(true), storage_(constexpr_forward(args)...) {} - - template >)> - OPTIONAL_CONSTEXPR_INIT_LIST explicit constexpr_optional_base(in_place_t, std::initializer_list il, Args&&... args) - : init_(true), storage_(il, std::forward(args)...) 
{} - - ~constexpr_optional_base() = default; -}; - -template -using OptionalBase = typename std::conditional< - std::is_trivially_destructible::value, // if possible - constexpr_optional_base::type>, // use base with trivial destructor - optional_base::type> ->::type; - - - -template -class optional : private OptionalBase -{ - static_assert( !std::is_same::type, nullopt_t>::value, "bad T" ); - static_assert( !std::is_same::type, in_place_t>::value, "bad T" ); - - - constexpr bool initialized() const noexcept { return OptionalBase::init_; } - typename std::remove_const::type* dataptr() { return std::addressof(OptionalBase::storage_.value_); } - constexpr const T* dataptr() const { return detail_::static_addressof(OptionalBase::storage_.value_); } - -# if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 - constexpr const T& contained_val() const& { return OptionalBase::storage_.value_; } -# if OPTIONAL_HAS_MOVE_ACCESSORS == 1 - OPTIONAL_MUTABLE_CONSTEXPR T&& contained_val() && { return std::move(OptionalBase::storage_.value_); } - OPTIONAL_MUTABLE_CONSTEXPR T& contained_val() & { return OptionalBase::storage_.value_; } -# else - T& contained_val() & { return OptionalBase::storage_.value_; } - T&& contained_val() && { return std::move(OptionalBase::storage_.value_); } -# endif -# else - constexpr const T& contained_val() const { return OptionalBase::storage_.value_; } - T& contained_val() { return OptionalBase::storage_.value_; } -# endif - - void clear() noexcept { - if (initialized()) dataptr()->T::~T(); - OptionalBase::init_ = false; - } - - template - void initialize(Args&&... args) noexcept(noexcept(T(std::forward(args)...))) - { - assert(!OptionalBase::init_); - ::new (static_cast(dataptr())) T(std::forward(args)...); - OptionalBase::init_ = true; - } - - template - void initialize(std::initializer_list il, Args&&... args) noexcept(noexcept(T(il, std::forward(args)...))) - { - assert(!OptionalBase::init_); - ::new (static_cast(dataptr())) T(il, std::forward(args)...); - OptionalBase::init_ = true; - } - -public: - typedef T value_type; - - // 20.5.5.1, constructors - constexpr optional() noexcept : OptionalBase() {}; - constexpr optional(nullopt_t) noexcept : OptionalBase() {}; - - optional(const optional& rhs) - : OptionalBase() - { - if (rhs.initialized()) { - ::new (static_cast(dataptr())) T(*rhs); - OptionalBase::init_ = true; - } - } - - optional(optional&& rhs) noexcept(std::is_nothrow_move_constructible::value) - : OptionalBase() - { - if (rhs.initialized()) { - ::new (static_cast(dataptr())) T(std::move(*rhs)); - OptionalBase::init_ = true; - } - } - - constexpr optional(const T& v) : OptionalBase(v) {} - - constexpr optional(T&& v) : OptionalBase(constexpr_move(v)) {} - - template - explicit constexpr optional(in_place_t, Args&&... args) - : OptionalBase(in_place_t{}, constexpr_forward(args)...) {} - - template >)> - OPTIONAL_CONSTEXPR_INIT_LIST explicit optional(in_place_t, std::initializer_list il, Args&&... args) - : OptionalBase(in_place_t{}, il, constexpr_forward(args)...) 
{} - - // 20.5.4.2, Destructor - ~optional() = default; - - // 20.5.4.3, assignment - optional& operator=(nullopt_t) noexcept - { - clear(); - return *this; - } - - optional& operator=(const optional& rhs) - { - if (initialized() == true && rhs.initialized() == false) clear(); - else if (initialized() == false && rhs.initialized() == true) initialize(*rhs); - else if (initialized() == true && rhs.initialized() == true) contained_val() = *rhs; - return *this; - } - - optional& operator=(optional&& rhs) - noexcept(std::is_nothrow_move_assignable::value && std::is_nothrow_move_constructible::value) - { - if (initialized() == true && rhs.initialized() == false) clear(); - else if (initialized() == false && rhs.initialized() == true) initialize(std::move(*rhs)); - else if (initialized() == true && rhs.initialized() == true) contained_val() = std::move(*rhs); - return *this; - } - - template - auto operator=(U&& v) - -> typename std::enable_if - < - std::is_same::type, T>::value, - optional& - >::type - { - if (initialized()) { contained_val() = std::forward(v); } - else { initialize(std::forward(v)); } - return *this; - } - - - template - void emplace(Args&&... args) - { - clear(); - initialize(std::forward(args)...); - } - - template - void emplace(std::initializer_list il, Args&&... args) - { - clear(); - initialize(il, std::forward(args)...); - } - - // 20.5.4.4, Swap - void swap(optional& rhs) noexcept(std::is_nothrow_move_constructible::value && noexcept(swap(std::declval(), std::declval()))) - { - if (initialized() == true && rhs.initialized() == false) { rhs.initialize(std::move(**this)); clear(); } - else if (initialized() == false && rhs.initialized() == true) { initialize(std::move(*rhs)); rhs.clear(); } - else if (initialized() == true && rhs.initialized() == true) { using std::swap; swap(**this, *rhs); } - } - - // 20.5.4.5, Observers - - explicit constexpr operator bool() const noexcept { return initialized(); } - constexpr bool has_value() const noexcept { return initialized(); } - - constexpr T const* operator ->() const { - return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), dataptr()); - } - -# if OPTIONAL_HAS_MOVE_ACCESSORS == 1 - - OPTIONAL_MUTABLE_CONSTEXPR T* operator ->() { - assert (initialized()); - return dataptr(); - } - - constexpr T const& operator *() const& { - return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), contained_val()); - } - - OPTIONAL_MUTABLE_CONSTEXPR T& operator *() & { - assert (initialized()); - return contained_val(); - } - - OPTIONAL_MUTABLE_CONSTEXPR T&& operator *() && { - assert (initialized()); - return constexpr_move(contained_val()); - } - - constexpr T const& value() const& { - return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); - } - - OPTIONAL_MUTABLE_CONSTEXPR T& value() & { - return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); - } - - OPTIONAL_MUTABLE_CONSTEXPR T&& value() && { - if (!initialized()) throw bad_optional_access("bad optional access"); - return std::move(contained_val()); - } - -# else - - T* operator ->() { - assert (initialized()); - return dataptr(); - } - - constexpr T const& operator *() const { - return contained_val(); - } - - T& operator *() { - assert (initialized()); - return contained_val(); - } - - constexpr T const& value() const { - return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); - } - - T& value() { - return initialized() ? 
contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); - } - -# endif - -# if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 - - template - constexpr T value_or(V&& v) const& - { - return *this ? **this : detail_::convert(constexpr_forward(v)); - } - -# if OPTIONAL_HAS_MOVE_ACCESSORS == 1 - - template - OPTIONAL_MUTABLE_CONSTEXPR T value_or(V&& v) && - { - return *this ? constexpr_move(const_cast&>(*this).contained_val()) : detail_::convert(constexpr_forward(v)); - } - -# else - - template - T value_or(V&& v) && - { - return *this ? constexpr_move(const_cast&>(*this).contained_val()) : detail_::convert(constexpr_forward(v)); - } - -# endif - -# else - - template - constexpr T value_or(V&& v) const - { - return *this ? **this : detail_::convert(constexpr_forward(v)); - } - -# endif - - // 20.6.3.6, modifiers - void reset() noexcept { clear(); } -}; - - -template -class optional -{ - static_assert( !std::is_same::value, "bad T" ); - static_assert( !std::is_same::value, "bad T" ); - T* ref; - -public: - - // 20.5.5.1, construction/destruction - constexpr optional() noexcept : ref(nullptr) {} - - constexpr optional(nullopt_t) noexcept : ref(nullptr) {} - - constexpr optional(T& v) noexcept : ref(detail_::static_addressof(v)) {} - - optional(T&&) = delete; - - constexpr optional(const optional& rhs) noexcept : ref(rhs.ref) {} - - explicit constexpr optional(in_place_t, T& v) noexcept : ref(detail_::static_addressof(v)) {} - - explicit optional(in_place_t, T&&) = delete; - - ~optional() = default; - - // 20.5.5.2, mutation - optional& operator=(nullopt_t) noexcept { - ref = nullptr; - return *this; - } - - // optional& operator=(const optional& rhs) noexcept { - // ref = rhs.ref; - // return *this; - // } - - // optional& operator=(optional&& rhs) noexcept { - // ref = rhs.ref; - // return *this; - // } - - template - auto operator=(U&& rhs) noexcept - -> typename std::enable_if - < - std::is_same::type, optional>::value, - optional& - >::type - { - ref = rhs.ref; - return *this; - } - - template - auto operator=(U&& rhs) noexcept - -> typename std::enable_if - < - !std::is_same::type, optional>::value, - optional& - >::type - = delete; - - void emplace(T& v) noexcept { - ref = detail_::static_addressof(v); - } - - void emplace(T&&) = delete; - - - void swap(optional& rhs) noexcept - { - std::swap(ref, rhs.ref); - } - - // 20.5.5.3, observers - constexpr T* operator->() const { - return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, ref); - } - - constexpr T& operator*() const { - return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, *ref); - } - - constexpr T& value() const { - return ref ? *ref : (throw bad_optional_access("bad optional access"), *ref); - } - - explicit constexpr operator bool() const noexcept { - return ref != nullptr; - } - - constexpr bool has_value() const noexcept { - return ref != nullptr; - } - - template - constexpr typename std::decay::type value_or(V&& v) const - { - return *this ? **this : detail_::convert::type>(constexpr_forward(v)); - } - - // x.x.x.x, modifiers - void reset() noexcept { ref = nullptr; } -}; - - -template -class optional -{ - static_assert( sizeof(T) == 0, "optional rvalue references disallowed" ); -}; - - -// 20.5.8, Relational operators -template constexpr bool operator==(const optional& x, const optional& y) -{ - return bool(x) != bool(y) ? false : bool(x) == false ? 
true : *x == *y; -} - -template constexpr bool operator!=(const optional& x, const optional& y) -{ - return !(x == y); -} - -template constexpr bool operator<(const optional& x, const optional& y) -{ - return (!y) ? false : (!x) ? true : *x < *y; -} - -template constexpr bool operator>(const optional& x, const optional& y) -{ - return (y < x); -} - -template constexpr bool operator<=(const optional& x, const optional& y) -{ - return !(y < x); -} - -template constexpr bool operator>=(const optional& x, const optional& y) -{ - return !(x < y); -} - - -// 20.5.9, Comparison with nullopt -template constexpr bool operator==(const optional& x, nullopt_t) noexcept -{ - return (!x); -} - -template constexpr bool operator==(nullopt_t, const optional& x) noexcept -{ - return (!x); -} - -template constexpr bool operator!=(const optional& x, nullopt_t) noexcept -{ - return bool(x); -} - -template constexpr bool operator!=(nullopt_t, const optional& x) noexcept -{ - return bool(x); -} - -template constexpr bool operator<(const optional&, nullopt_t) noexcept -{ - return false; -} - -template constexpr bool operator<(nullopt_t, const optional& x) noexcept -{ - return bool(x); -} - -template constexpr bool operator<=(const optional& x, nullopt_t) noexcept -{ - return (!x); -} - -template constexpr bool operator<=(nullopt_t, const optional&) noexcept -{ - return true; -} - -template constexpr bool operator>(const optional& x, nullopt_t) noexcept -{ - return bool(x); -} - -template constexpr bool operator>(nullopt_t, const optional&) noexcept -{ - return false; -} - -template constexpr bool operator>=(const optional&, nullopt_t) noexcept -{ - return true; -} - -template constexpr bool operator>=(nullopt_t, const optional& x) noexcept -{ - return (!x); -} - - - -// 20.5.10, Comparison with T -template constexpr bool operator==(const optional& x, const T& v) -{ - return bool(x) ? *x == v : false; -} - -template constexpr bool operator==(const T& v, const optional& x) -{ - return bool(x) ? v == *x : false; -} - -template constexpr bool operator!=(const optional& x, const T& v) -{ - return bool(x) ? *x != v : true; -} - -template constexpr bool operator!=(const T& v, const optional& x) -{ - return bool(x) ? v != *x : true; -} - -template constexpr bool operator<(const optional& x, const T& v) -{ - return bool(x) ? *x < v : true; -} - -template constexpr bool operator>(const T& v, const optional& x) -{ - return bool(x) ? v > *x : true; -} - -template constexpr bool operator>(const optional& x, const T& v) -{ - return bool(x) ? *x > v : false; -} - -template constexpr bool operator<(const T& v, const optional& x) -{ - return bool(x) ? v < *x : false; -} - -template constexpr bool operator>=(const optional& x, const T& v) -{ - return bool(x) ? *x >= v : false; -} - -template constexpr bool operator<=(const T& v, const optional& x) -{ - return bool(x) ? v <= *x : false; -} - -template constexpr bool operator<=(const optional& x, const T& v) -{ - return bool(x) ? *x <= v : true; -} - -template constexpr bool operator>=(const T& v, const optional& x) -{ - return bool(x) ? v >= *x : true; -} - - -// Comparison of optional with T -template constexpr bool operator==(const optional& x, const T& v) -{ - return bool(x) ? *x == v : false; -} - -template constexpr bool operator==(const T& v, const optional& x) -{ - return bool(x) ? v == *x : false; -} - -template constexpr bool operator!=(const optional& x, const T& v) -{ - return bool(x) ? 
*x != v : true; -} - -template constexpr bool operator!=(const T& v, const optional& x) -{ - return bool(x) ? v != *x : true; -} - -template constexpr bool operator<(const optional& x, const T& v) -{ - return bool(x) ? *x < v : true; -} - -template constexpr bool operator>(const T& v, const optional& x) -{ - return bool(x) ? v > *x : true; -} - -template constexpr bool operator>(const optional& x, const T& v) -{ - return bool(x) ? *x > v : false; -} - -template constexpr bool operator<(const T& v, const optional& x) -{ - return bool(x) ? v < *x : false; -} - -template constexpr bool operator>=(const optional& x, const T& v) -{ - return bool(x) ? *x >= v : false; -} - -template constexpr bool operator<=(const T& v, const optional& x) -{ - return bool(x) ? v <= *x : false; -} - -template constexpr bool operator<=(const optional& x, const T& v) -{ - return bool(x) ? *x <= v : true; -} - -template constexpr bool operator>=(const T& v, const optional& x) -{ - return bool(x) ? v >= *x : true; -} - -// Comparison of optional with T -template constexpr bool operator==(const optional& x, const T& v) -{ - return bool(x) ? *x == v : false; -} - -template constexpr bool operator==(const T& v, const optional& x) -{ - return bool(x) ? v == *x : false; -} - -template constexpr bool operator!=(const optional& x, const T& v) -{ - return bool(x) ? *x != v : true; -} - -template constexpr bool operator!=(const T& v, const optional& x) -{ - return bool(x) ? v != *x : true; -} - -template constexpr bool operator<(const optional& x, const T& v) -{ - return bool(x) ? *x < v : true; -} - -template constexpr bool operator>(const T& v, const optional& x) -{ - return bool(x) ? v > *x : true; -} - -template constexpr bool operator>(const optional& x, const T& v) -{ - return bool(x) ? *x > v : false; -} - -template constexpr bool operator<(const T& v, const optional& x) -{ - return bool(x) ? v < *x : false; -} - -template constexpr bool operator>=(const optional& x, const T& v) -{ - return bool(x) ? *x >= v : false; -} - -template constexpr bool operator<=(const T& v, const optional& x) -{ - return bool(x) ? v <= *x : false; -} - -template constexpr bool operator<=(const optional& x, const T& v) -{ - return bool(x) ? *x <= v : true; -} - -template constexpr bool operator>=(const T& v, const optional& x) -{ - return bool(x) ? v >= *x : true; -} - - -// 20.5.12, Specialized algorithms -template -void swap(optional& x, optional& y) noexcept(noexcept(x.swap(y))) -{ - x.swap(y); -} - - -template -constexpr optional::type> make_optional(T&& v) -{ - return optional::type>(constexpr_forward(v)); -} - -template -constexpr optional make_optional(std::reference_wrapper v) -{ - return optional(v.get()); -} - - -} // namespace at - -namespace std -{ - template - struct hash> - { - typedef typename hash::result_type result_type; - typedef at::optional argument_type; - - constexpr result_type operator()(argument_type const& arg) const { - return arg ? std::hash{}(*arg) : result_type{}; - } - }; - - template - struct hash> - { - typedef typename hash::result_type result_type; - typedef at::optional argument_type; - - constexpr result_type operator()(argument_type const& arg) const { - return arg ? 
std::hash{}(*arg) : result_type{}; - } - }; -} - -# undef TR2_OPTIONAL_REQUIRES -# undef TR2_OPTIONAL_ASSERTED_EXPRESSION +#include diff --git a/aten/src/ATen/templates/StorageDerived.cpp b/aten/src/ATen/templates/StorageDerived.cpp deleted file mode 100644 index 0491203c3286e6..00000000000000 --- a/aten/src/ATen/templates/StorageDerived.cpp +++ /dev/null @@ -1,69 +0,0 @@ -#include "ATen/${Storage}.h" - -// ${generated_comment} - -#include "ATen/Half.h" -#include "ATen/Allocator.h" -#include - -#include "ATen/Config.h" -$extra_cuda_headers - -namespace at { - -${Storage}::${Storage}() - : Storage(new StorageImpl( - ScalarType::${ScalarName}, - 0, -#if ${isCUDA} - globalContext().getTHCState()->cudaDeviceAllocator, -#else - getTHDefaultAllocator(), -#endif - /* resizable */ true)) {} - -${Storage}::${Storage}(size_t size) - : Storage(new StorageImpl( - ScalarType::${ScalarName}, - size, -#if ${isCUDA} - globalContext().getTHCState()->cudaDeviceAllocator, -#else - getTHDefaultAllocator(), -#endif - /* resizable */ true)) {} - -${Storage}::${Storage}(size_t size, Allocator* allocator) - : Storage(new StorageImpl( - ScalarType::${ScalarName}, - size, - allocator, - /* resizable */ false)) {} - -// TODO: Take in Device as an input to the std::function constructor - -#if ${isCUDA} -static int getPointerDevice(void* ptr) { - struct cudaPointerAttributes attr; - THCudaCheck(cudaPointerGetAttributes(&attr, ptr)); - return attr.device; -} -#endif - -${Storage}::${Storage}( - void * data, - size_t size, - const std::function & deleter) - : Storage(new StorageImpl( - ScalarType::${ScalarName}, - size, - InefficientStdFunctionContext::makeDataPtr(data, deleter, -#if ${isCUDA} - Device(kCUDA, getPointerDevice(data)) -#else - kCPU -#endif - ), - /* allocator */ nullptr, - /* resizable */ false)) {} -} diff --git a/aten/src/ATen/templates/StorageDerived.h b/aten/src/ATen/templates/StorageDerived.h deleted file mode 100644 index dddcd5dbf03f21..00000000000000 --- a/aten/src/ATen/templates/StorageDerived.h +++ /dev/null @@ -1,31 +0,0 @@ -#pragma once - -// ${generated_comment} - -$th_headers - -#include "ATen/Storage.h" -#include "ATen/Context.h" - -#include - -namespace at { - -struct Allocator; - -struct ${Storage} final : public Storage { - ${Storage}(); - ${Storage}(StorageImpl* storage_impl) : Storage(storage_impl){}; - ${Storage}(size_t size); - ${Storage}(size_t size, Allocator* allocator); - ${Storage}( - void* data, - size_t size, - const std::function& deleter); - StorageImpl* storage_impl_; - - protected: - friend struct ${Type}; -}; - -} // namespace at diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 31e952ebb79ff8..55fb4aec0cbb60 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -2,7 +2,6 @@ // ${generated_comment} -#include "ATen/Generator.h" #include "ATen/Scalar.h" #include "ATen/ScalarType.h" #include "ATen/SparseTensorRef.h" @@ -10,12 +9,12 @@ #include "ATen/TensorAccessor.h" #include "ATen/TensorBase.h" #include "ATen/TensorImpl.h" -#include "ATen/Utils.h" #include "ATen/Device.h" #include "ATen/Layout.h" #include "ATen/optional.h" namespace at { +struct Generator; struct Type; struct Tensor; struct TensorOptions; diff --git a/aten/src/ATen/templates/TensorDense.cpp b/aten/src/ATen/templates/TensorDense.cpp index cc2f47a89180ab..aeba9fb22a3653 100644 --- a/aten/src/ATen/templates/TensorDense.cpp +++ b/aten/src/ATen/templates/TensorDense.cpp @@ -3,5 +3,5 @@ std::unique_ptr ${Tensor}::storage() { auto storage = 
THTensor_getStoragePtr(tensor); THStorage_retain(storage); - return std::unique_ptr(new ${Storage}(storage)); + return std::unique_ptr(new Storage(storage)); } diff --git a/aten/src/ATen/templates/TensorDerived.cpp b/aten/src/ATen/templates/TensorDerived.cpp index d72ba4abde2c12..5fab8bf2226417 100644 --- a/aten/src/ATen/templates/TensorDerived.cpp +++ b/aten/src/ATen/templates/TensorDerived.cpp @@ -5,9 +5,8 @@ // ${generated_comment} -#include "ATen/Config.h" #include "ATen/${Tensor}.h" -#include "ATen/${Storage}.h" +#include "ATen/Storage.h" #include "ATen/Scalar.h" #include "ATen/Half.h" @@ -22,7 +21,7 @@ namespace detail { } ${Tensor}::${Tensor}(${THTensor} * tensor) -: TensorImpl(&globalContext().getType(Backend::${Backend},ScalarType::${ScalarName}), tensor) +: TensorImpl(Backend::${Backend}, ScalarType::${ScalarName}, tensor, /* is variable */ false) {} ${TensorDenseOrSparse} diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index 67009473dddefc..ddd1483f0436f3 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -31,6 +31,14 @@ namespace at { +#if ${isCUDA} +static int getPointerDevice(void* ptr) { + struct cudaPointerAttributes attr; + THCudaCheck(cudaPointerGetAttributes(&attr, ptr)); + return attr.device; +} +#endif + ${Type}::${Type}(Context* context) : Type(context, /*is_variable=*/false, /*is_undefined=*/false) {} ScalarType ${Type}::scalarType() const { @@ -44,18 +52,44 @@ bool ${Type}::is_sparse() const { return backend() == kSparseCPU || backend() == bool ${Type}::is_distributed() const { return false; } std::unique_ptr ${Type}::storage() const { - return std::unique_ptr(new ${Storage}()); + return std::unique_ptr(new Storage( + ScalarType::${ScalarName}, + 0, +#if ${isCUDA} + globalContext().getTHCState()->cudaDeviceAllocator +#else + getTHDefaultAllocator() +#endif + )); } std::unique_ptr ${Type}::storage(size_t size) const { - return std::unique_ptr(new ${Storage}(size)); + return std::unique_ptr(new Storage( + ScalarType::${ScalarName}, + size, +#if ${isCUDA} + globalContext().getTHCState()->cudaDeviceAllocator +#else + getTHDefaultAllocator() +#endif + )); } std::unique_ptr ${Type}::storageFromBlob(void * data, int64_t size, const std::function & deleter) const { return std::unique_ptr( - new ${Storage}(data,size,deleter)); + new Storage( + ScalarType::${ScalarName}, + InefficientStdFunctionContext::makeDataPtr(data, deleter, +#if ${isCUDA} + Device(kCUDA, getPointerDevice(data)) +#else + kCPU +#endif + ), + size, + deleter)); } std::unique_ptr ${Type}::storageWithAllocator(int64_t size, Allocator* allocator) const { return std::unique_ptr( - new ${Storage}(size, allocator)); + new Storage(ScalarType::${ScalarName}, size, allocator)); } Tensor ${Type}::unsafeTensorFromTH(void * th_pointer, bool retain) const { if (retain) @@ -65,7 +99,7 @@ Tensor ${Type}::unsafeTensorFromTH(void * th_pointer, bool retain) const { std::unique_ptr ${Type}::unsafeStorageFromTH(void * th_pointer, bool retain) const { if (retain) ${THStorage}_retain(${state,} (${THStorage}*) th_pointer); - return std::unique_ptr(new ${Storage}((${THStorage}*) th_pointer)); + return std::unique_ptr(new Storage((${THStorage}*) th_pointer)); } std::unique_ptr ${Type}::generator() const { return std::unique_ptr(new ${Generator}(context)); diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 6b46c8c0b70018..8e58df97073086 100644 --- a/aten/src/ATen/test/basic.cpp +++ 
b/aten/src/ATen/test/basic.cpp @@ -270,6 +270,10 @@ static void test(Type & type) { auto result = tensor.m(relu).m(mse_loss, other, Reduction::ElementwiseMean); REQUIRE(result.allclose(mse_loss(relu(tensor), other))); } + SECTION("core") { + int i = CoreTest(); + REQUIRE(i + 1 == CoreTest()); + } } TEST_CASE( "basic tests CPU", "[cpu]" ) { diff --git a/aten/src/ATen/test/scalar_tensor_test.cpp b/aten/src/ATen/test/scalar_tensor_test.cpp index 64098c5bf76c56..4a400e3a517ee6 100644 --- a/aten/src/ATen/test/scalar_tensor_test.cpp +++ b/aten/src/ATen/test/scalar_tensor_test.cpp @@ -65,30 +65,13 @@ void test(Type &T) { require_equal_size_dim(t2, ones({0}, T)); // unsqueeze -#ifndef USE_TH_SIZE_ZERO_DIM - if (t.numel() != 0) { - REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); - } else { - REQUIRE_THROWS(t.unsqueeze(0)); - } -#else REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); -#endif // unsqueeze_ { auto t2 = ones(*s, T); -#ifndef USE_TH_SIZE_ZERO_DIM - if (t2.numel() != 0) { - auto r = t2.unsqueeze_(0); - REQUIRE(r.dim() == t.dim() + 1); - } else { - REQUIRE_THROWS(t2.unsqueeze_(0)); - } -#else auto r = t2.unsqueeze_(0); REQUIRE(r.dim() == t.dim() + 1); -#endif } // squeeze (with dimension argument) diff --git a/aten/src/TH/THHalf.cpp b/aten/src/TH/THHalf.cpp index 1c46c59a9977fa..840c97617c4cb2 100644 --- a/aten/src/TH/THHalf.cpp +++ b/aten/src/TH/THHalf.cpp @@ -1,4 +1,5 @@ #include "THHalf.h" +#include /* Copyright 1993-2014 NVIDIA Corporation. All rights reserved. */ @@ -16,85 +17,14 @@ TH_API float TH_half2float(THHalf h) return f; } -// Host functions for converting between FP32 and FP16 formats void TH_halfbits2float(unsigned short* src, float* res) { - unsigned h = *src; - unsigned sign = ((h >> 15) & 1); - unsigned exponent = ((h >> 10) & 0x1f); - unsigned mantissa = ((h & 0x3ff) << 13); - - if (exponent == 0x1f) { /* NaN or Inf */ - mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0); - exponent = 0xff; - } else if (!exponent) { /* Denorm or Zero */ - if (mantissa) { - unsigned int msb; - exponent = 0x71; - do { - msb = (mantissa & 0x400000); - mantissa <<= 1; /* normalize */ - --exponent; - } while (!msb); - mantissa &= 0x7fffff; /* 1.mantissa is implicit */ - } - } else { - exponent += 0x70; - } - - *(unsigned*)res = ((sign << 31) | (exponent << 23) | mantissa); + *res = at::detail::halfbits2float(*src); } + void TH_float2halfbits(float* src, unsigned short* dest) { - unsigned x = *(unsigned*)src; - unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; - unsigned sign, exponent, mantissa; - - // Get rid of +NaN/-NaN case first. - if (u > 0x7f800000) { - *dest = 0x7fffU; - return ; - } - - sign = ((x >> 16) & 0x8000); - - // Get rid of +Inf/-Inf, +0/-0. - if (u > 0x477fefff) { - *dest = sign | 0x7c00U; - return; - } - if (u < 0x33000001) { - *dest = (sign | 0x0000); - return; - } - - exponent = ((u >> 23) & 0xff); - mantissa = (u & 0x7fffff); - - if (exponent > 0x70) { - shift = 13; - exponent -= 0x70; - } else { - shift = 0x7e - exponent; - exponent = 0; - mantissa |= 0x800000; - } - lsb = (1 << shift); - lsb_s1 = (lsb >> 1); - lsb_m1 = (lsb - 1); - - // Round to nearest even. 
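[Editor's note] The deleted lines just below implement the ties-to-even rounding named by that comment; the conversion now delegates to at::detail::float2halfbits. A self-contained sketch of the same rounding step, omitting the exponent carry that the full routine also performs when the mantissa overflows:

#include <cstdint>

// Drop `shift` low bits of the mantissa, rounding to nearest; on an exact tie
// (remainder == half of the last kept bit) round up only if the kept low bit
// is 1, so results land on even mantissa values.
static uint32_t round_mantissa_nearest_even(uint32_t mantissa, uint32_t shift) {
  uint32_t lsb = 1u << shift;            // value of the last kept bit
  uint32_t lsb_s1 = lsb >> 1;            // exactly half of that
  uint32_t remainder = mantissa & (lsb - 1);
  mantissa >>= shift;
  if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 1u))) {
    ++mantissa;                          // round up; ties go to the even value
  }
  return mantissa;
}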
- remainder = (mantissa & lsb_m1); - mantissa >>= shift; - if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { - ++mantissa; - if (!(mantissa & 0x3ff)) { - ++exponent; - mantissa = 0; - } - } - - *dest = (sign | (exponent << 10) | mantissa); + *dest = at::detail::float2halfbits(*src); } diff --git a/aten/src/TH/THStorageFunctions.cpp b/aten/src/TH/THStorageFunctions.cpp index 0f05bb466651d3..0c36d5bf97fcf0 100644 --- a/aten/src/TH/THStorageFunctions.cpp +++ b/aten/src/TH/THStorageFunctions.cpp @@ -19,38 +19,25 @@ void THStorage_free(THStorage* storage) { if (!storage) { return; } - - if (--storage->refcount == 0) { - if (storage->finalizer) { - (*storage->finalizer)(); - } - storage->finalizer = nullptr; - storage->data_ptr.clear(); - THStorage_weakFree(storage); - } + storage->release(); } // Manually retains a weak reference void THStorage_weakRetain(THStorage *weak_storage) { - weak_storage->weakcount++; + weak_storage->weak_retain(); } // Releases a weak reference void THStorage_weakFree(THStorage *weak_storage) { - if (--weak_storage->weakcount == 0) { - delete weak_storage; - } + weak_storage->weak_release(); } // Given a weak reference, returns a strong reference to a storage (which must // be freed when done) or null if the storage is already dead. THStorage* THStorage_weakLock(THStorage *weak_storage) { - for (;;) { - int refcount = weak_storage->refcount.load(); - if (refcount == 0) return nullptr; - if (weak_storage->refcount.compare_exchange_strong(refcount, refcount + 1)) break; - } - return weak_storage; + if (weak_storage->weak_lock()) + return weak_storage; + return nullptr; } THDescBuff THLongStorage_sizeDesc(const THLongStorage *size) { @@ -95,7 +82,7 @@ ptrdiff_t THStorage_size(const THStorage *self) void THStorage_retain(THStorage *storage) { if (storage) { - ++storage->refcount; + storage->retain(); } } diff --git a/aten/src/TH/THStorageFunctions.hpp b/aten/src/TH/THStorageFunctions.hpp index 671e2f39fb1c7e..0e8b3e4ab17bee 100644 --- a/aten/src/TH/THStorageFunctions.hpp +++ b/aten/src/TH/THStorageFunctions.hpp @@ -35,8 +35,6 @@ TH_API ptrdiff_t THStorage_size(const THStorage *self); -TH_API void THStorage_setFlag(THStorage *storage, const char flag); -TH_API void THStorage_clearFlag(THStorage *storage, const char flag); TH_API void THStorage_retain(THStorage *storage); TH_API void THStorage_resize(THStorage *storage, ptrdiff_t size); TH_API void THStorage_swap(THStorage *storage1, THStorage *storage2); diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index 13df5128e5f5f8..5f3b6ed1fef6cc 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -32,7 +32,7 @@ THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, at::IntList // This could perhaps be combined with the below code, but the complexity didn't seem worth it. 
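[Editor's note] Looking back at the THStorageFunctions.cpp hunk above: THStorage_weakLock's hand-rolled compare-exchange loop is now hidden behind weak_lock(). A minimal std::atomic sketch of the promotion that call has to implement (illustrative only, not the actual member function):

#include <atomic>

// Promote a weak reference to a strong one only if the strong refcount is
// still nonzero; compare-exchange ensures the bump cannot race with the
// final release.
static bool try_lock_weak(std::atomic<int>& refcount) {
  int current = refcount.load();
  while (current != 0) {
    if (refcount.compare_exchange_weak(current, current + 1)) {
      return true;   // promoted: caller now owns a strong reference
    }
    // on failure, `current` was reloaded; retry unless it dropped to zero
  }
  return false;      // storage already dead; the weak reference cannot be promoted
}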
int64_t numel = std::accumulate(oldshape.begin(), oldshape.end(), 1, std::multiplies()); if (numel == 0 && oldshape.equals(newshape)) { - return std::vector(oldstride); + return oldstride.vec(); } std::vector newstride(newshape.size()); diff --git a/aten/src/TH/THTensor.hpp b/aten/src/TH/THTensor.hpp index 16329f7ed7f621..56204a00e9c3ed 100644 --- a/aten/src/TH/THTensor.hpp +++ b/aten/src/TH/THTensor.hpp @@ -56,6 +56,10 @@ struct THTensor return sizes_.size(); } + at::ScalarType scalar_type() const { + return storage_->scalar_type; + } + ptrdiff_t storage_offset() const { return storage_offset_; } @@ -109,6 +113,17 @@ inline int64_t* THTensor_getStridePtr(THTensor* tensor) { // NB: Non-retaining inline THStorage* THTensor_getStoragePtr(const THTensor* tensor) { + // Within PyTorch, the invariant is that storage_ is always + // initialized; we never have tensors that don't have any storage. + // However, for Caffe2, this is not true, because they have permitted + // tensors to be allocated without specifying what scalar type + // they should be, only to be filled when GetMutableData is called + // for the first time (providing the necessary type). It is an ERROR to + // invoke any PyTorch operations on such a half-constructed storage, + // and this check tests for that case. + AT_CHECK(tensor->storage_, "Cannot use PyTorch operations on a half-constructed " + "tensor. If this tensor came from Caffe2, please call GetMutableData on " + "it first; otherwise, this is a bug, please report it."); return tensor->storage_; } @@ -118,6 +133,7 @@ inline THStorage* THTensor_getStoragePtr(const THTensor* tensor) { inline void THTensor_resizeDim(THTensor* tensor, int64_t ndim) { // NB: This is *truly* a resize; calling code (e.g., squeeze) // assumes that old values are preserved + tensor->is_zero_dim_ = bool(ndim == 0); tensor->sizes_.resize(ndim); tensor->strides_.resize(ndim); } @@ -141,6 +157,9 @@ inline void THTensor_setStorageOffset(THTensor* tensor, ptrdiff_t storage_offset // NB: Steals ownership of storage inline void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage) { + // Caffe2 might have tensors whose storages are null, but we + // don't allow it in PyTorch. + AT_ASSERT(storage); tensor->storage_ = storage; } @@ -177,6 +196,19 @@ inline int THTensor_nDimensionLegacyAll(const THTensor* tensor) { } } +inline int64_t THTensor_strideLegacyNoScalars(const THTensor *self, int dim) { + THArgCheck((dim >= 0) && (dim < THTensor_nDimensionLegacyNoScalars(self)), 2, "dimension %d out of range of %dD tensor", + dim+TH_INDEX_BASE, THTensor_nDimensionLegacyNoScalars(self)); + return THTensor_isZeroDim(self) ? 1 : self->stride(dim); +} + +inline int64_t THTensor_sizeLegacyNoScalars(const THTensor *self, int dim) +{ + THArgCheck((dim >= 0) && (dim < THTensor_nDimensionLegacyNoScalars(self)), 2, "dimension %d out of range of %dD tensor", + dim+TH_INDEX_BASE, THTensor_nDimensionLegacyNoScalars(self)); + return THTensor_isZeroDim(self) ? 
1 : self->size(dim); +} + TH_API void THTensor_free(THTensor *self); TH_CPP_API at::optional> THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, at::IntList newshape); diff --git a/aten/src/TH/THTensorDimApply.h b/aten/src/TH/THTensorDimApply.h index 00c24dee51adb8..ff05ed8194979d 100644 --- a/aten/src/TH/THTensorDimApply.h +++ b/aten/src/TH/THTensorDimApply.h @@ -39,8 +39,8 @@ int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \ int TH_TENSOR_DIM_APPLY_i; \ \ - if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->dim()) ) \ - THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, TENSOR1->dim()); \ + if( (DIMENSION < 0) || (DIMENSION >= THTensor_nDimensionLegacyNoScalars(TENSOR1)) ) \ + THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, THTensor_nDimensionLegacyNoScalars(TENSOR1)); \ int same_dims = 1; \ if( TENSOR1->dim() != TENSOR2->dim() ) { \ same_dims = 0; \ @@ -56,8 +56,8 @@ if (TH_TENSOR_DIM_APPLY_hasFinished) { \ return; \ } \ - TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->dim())); \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(THTensor_nDimensionLegacyNoScalars(TENSOR1))); \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ \ TENSOR1##_data = THTensor_getStoragePtr(TENSOR1)->data()+(TENSOR1)->storage_offset(); \ @@ -76,14 +76,14 @@ { \ CODE \ \ - if(TENSOR1->dim() == 1) \ + if(THTensor_nDimensionLegacyNoScalars(TENSOR1) == 1) \ break; \ \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \ { \ if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ + if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ @@ -98,7 +98,7 @@ \ if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size(TH_TENSOR_DIM_APPLY_i)) \ { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ + if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ @@ -145,13 +145,13 @@ int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \ int TH_TENSOR_DIM_APPLY_i; \ \ - if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->dim()) ) \ + if( (DIMENSION < 0) || (DIMENSION >= THTensor_nDimensionLegacyNoScalars(TENSOR1)) ) \ THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, THTensor_nDimensionLegacyAll(TENSOR1)); \ if( TENSOR1->dim() != TENSOR2->dim() ) { \ AT_ERROR("inconsistent tensor size, expected ", #TENSOR1, " ", TENSOR1->sizes(), " and ", #TENSOR2, " ", TENSOR2->sizes(), " to have the same number of dimensions"); \ } \ TH_UNUSED int shape_check_flag = 0; \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \ { \ if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ continue; \ @@ -163,8 +163,8 @@ if (TH_TENSOR_DIM_APPLY_hasFinished) { \ return; \ } \ - TH_TENSOR_DIM_APPLY_counter = 
(int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->dim())); \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(THTensor_nDimensionLegacyNoScalars(TENSOR1))); \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ \ TENSOR1##_data = THTensor_getStoragePtr(TENSOR1)->data()+(TENSOR1)->storage_offset(); \ @@ -179,14 +179,14 @@ { \ CODE \ \ - if(TENSOR1->dim() == 1) \ + if(THTensor_nDimensionLegacyNoScalars(TENSOR1) == 1) \ break; \ \ for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ { \ if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ + if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ @@ -200,7 +200,7 @@ \ if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size(TH_TENSOR_DIM_APPLY_i)) \ { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ + if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp index 58a5d39366c294..e68c60a9455c4f 100644 --- a/aten/src/TH/generic/THTensor.cpp +++ b/aten/src/TH/generic/THTensor.cpp @@ -373,11 +373,7 @@ void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension, int64_t fir THArgCheck( (dimension >= 0) && (dimension < src->dim()), 2, "out of range"); THArgCheck( firstIndex >= 0, 3, "out of range"); -#ifdef USE_TH_SIZE_ZERO_DIM THArgCheck( size >= 0, 4, "out of range"); -#else - THArgCheck( size > 0, 4, "out of range"); -#endif THArgCheck(firstIndex <= src->size(dimension) - size, 4, "out of range"); THTensor_(set)(self, src); @@ -396,12 +392,8 @@ void THTensor_(select)(THTensor *self, THTensor *src, int dimension, int64_t sli if(!src) src = self; -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(THTensor_nDimensionLegacyAll(src) > 1, 1, "cannot select on a vector"); -#else #ifndef USE_TH_SCALAR THArgCheck(src->dim() > 1, 1, "cannot select on a vector"); -#endif #endif THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "out of range"); THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size(dimension)), 3, "out of range"); @@ -423,8 +415,8 @@ void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1, int dim if(!src) src = self; - THArgCheck( (dimension1 >= 0) && (dimension1 < src->dim()), 1, "out of range"); - THArgCheck( (dimension2 >= 0) && (dimension2 < src->dim()), 2, "out of range"); + THArgCheck( (dimension1 >= 0) && (dimension1 < THTensor_nDimensionLegacyNoScalars(src)), 1, "out of range"); + THArgCheck( (dimension2 >= 0) && (dimension2 < THTensor_nDimensionLegacyNoScalars(src)), 2, "out of range"); THTensor_(set)(self, src); @@ -446,10 +438,7 @@ void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension, int64_t siz if(!src) src = self; -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(!src->is_empty(), 1, "cannot unfold an empty tensor"); -#endif - THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "out of range"); + THArgCheck((dimension >= 0) && (dimension < THTensor_nDimensionLegacyNoScalars(src)), 2, "out of range"); THArgCheck(size <= src->size(dimension), 3, "out of range"); THArgCheck(step > 0, 4, "invalid step"); @@ -459,18 +448,20 
@@ void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension, int64_t siz std::vector newStride(/* size */ self->dim()+1); newSize[self->dim()] = size; - newStride[self->dim()] = self->stride(dimension); + newStride[self->dim()] = THTensor_strideLegacyNoScalars(self, dimension); for(d = 0; d < self->dim(); d++) { + auto self_size = THTensor_sizeLegacyNoScalars(self, d); + auto self_stride = THTensor_strideLegacyNoScalars(self, d); if(d == dimension) { - newSize[d] = (self->size(d) - size) / step + 1; - newStride[d] = step*self->stride(d); + newSize[d] = (self_size - size) / step + 1; + newStride[d] = step*self_stride; } else { - newSize[d] = self->size(d); - newStride[d] = self->stride(d); + newSize[d] = self_size; + newStride[d] = self_stride; } } @@ -547,9 +538,6 @@ void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension) src = self; THArgCheck((dimension >= 0) && (dimension <= src->dim()), 2, "dimension out of range"); -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(!src->is_empty(), 2, "cannot unsqueeze empty tensor"); -#endif THTensor_(set)(self, src); @@ -728,15 +716,6 @@ void THTensor_(resizeNd)(THTensor *self, int nDimension, int64_t *size, int64_t for(d = 0; d < nDimension; d++) { -#ifndef USE_TH_SIZE_ZERO_DIM - // we can't support this unless we have arbitrary 0-sized dimensions, but some calls to this - // currently exist and expect a size [0] tensor to be returned. - if (d == 0 && size[d] == 0) { - nDimension = 1; - } else { - AT_CHECK(size[d] > 0, "sizes must be non-negative"); - } -#endif if((self->dim() > d) && (size[d] != self->size(d))) { hascorrectsize = false; } @@ -790,14 +769,14 @@ void THTensor_(resizeNd)(THTensor *self, int nDimension, int64_t *size, int64_t void THTensor_(set1d)(THTensor *tensor, int64_t x0, real value) { - THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 1, 1, "tensor must have one dimension"); + THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0), value); } real THTensor_(get1d)(const THTensor *tensor, int64_t x0) { - THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 1, 1, "tensor must have one dimension"); + THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)); } diff --git a/aten/src/TH/generic/THTensorEvenMoreMath.cpp b/aten/src/TH/generic/THTensorEvenMoreMath.cpp index 644fa541a8f9ae..03946724dcadc6 100644 --- a/aten/src/TH/generic/THTensorEvenMoreMath.cpp +++ b/aten/src/TH/generic/THTensorEvenMoreMath.cpp @@ -149,15 +149,8 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens int64_t *index_data; real *tensor_data, *src_data; -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(THTensor_nDimensionLegacyAll(index) <= 1, 3, "Index is supposed to be an empty tensor or a vector"); - THArgCheck(dim < THTensor_nDimensionLegacyAll(src), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); - THArgCheck(THTensor_nDimensionLegacyAll(src) > 0, 2, "Source tensor is empty"); -#else - THArgCheck(index->dim() == 1, 3, "Index is supposed to be 1-dimensional"); - THArgCheck(dim < src->dim(), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); - 
//THArgCheck(src->dim() > 0, 2, "Source tensor is empty"); -#endif + THArgCheck(THTensor_nDimensionLegacyNoScalars(index) == 1, 3, "Index is supposed to be 1-dimensional"); + THArgCheck(dim < THTensor_nDimensionLegacyNoScalars(src), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); numel = THLongTensor_nElement(index); @@ -188,7 +181,7 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens } } - if (src->dim() == 1) { + if (src->dim() <= 1) { #pragma omp parallel for if(numel > TH_OMP_OVERHEAD_THRESHOLD) private(i) for (i=0; idim() == 1) + else if (src->dim() <= 1) { for (i=0; idim() == 1, 3, "Index is supposed to be a vector"); - THArgCheck(dim < src->dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); -#endif - THArgCheck(numel == src->size(dim),4,"Number of indices should be equal to source:size(dim)"); + THArgCheck(THTensor_nDimensionLegacyNoScalars(index) == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < THTensor_nDimensionLegacyNoScalars(src), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); + THArgCheck(numel == THTensor_sizeLegacyNoScalars(src, dim),4,"Number of indices should be equal to source:size(dim)"); index = THLongTensor_newContiguous(index); index_data = THLongTensor_data(index); @@ -400,13 +388,8 @@ void THTensor_(indexFill)(THTensor *tensor, int dim, THLongTensor *index, real v int64_t *index_data; numel = THLongTensor_nElement(index); -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(THTensor_nDimensionLegacyAll(index) == 1, 3, "Index is supposed to be a vector"); - THArgCheck(dim < THTensor_nDimensionLegacyAll(tensor), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); -#else - THArgCheck(index->dim() == 1, 3, "Index is supposed to be a vector"); - THArgCheck(dim < tensor->dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); -#endif + THArgCheck(THTensor_nDimensionLegacyNoScalars(index) == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < THTensor_nDimensionLegacyNoScalars(tensor), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); index = THLongTensor_newContiguous(index); index_data = THLongTensor_data(index); @@ -459,19 +442,11 @@ void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor { int64_t elems_per_row, i, idx; -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(dim < THTensor_(nDimensionLegacyAll)(tensor), 2, "Index dimension is out of bounds"); - THArgCheck(THLongTensor_nDimensionLegacyAll(index) == THTensor_(nDimensionLegacyAll)(tensor), 3, - "Index tensor must have same dimensions as output tensor"); - THArgCheck(THTensor_(nDimensionLegacyAll)(src) == THTensor_(nDimensionLegacyAll)(tensor), 4, - "Input tensor must have same dimensions as output tensor"); -#else THArgCheck(dim < THTensor_(nDimensionLegacyNoScalars)(tensor), 2, "Index dimension is out of bounds"); THArgCheck(THLongTensor_nDimensionLegacyNoScalars(index) == THTensor_(nDimensionLegacyNoScalars)(tensor), 3, "Index tensor must have same dimensions as output tensor"); THArgCheck(THTensor_(nDimensionLegacyNoScalars)(src) == THTensor_(nDimensionLegacyNoScalars)(tensor), 4, "Input tensor must have same dimensions as output tensor"); -#endif elems_per_row = THLongTensor_size(index, dim); diff --git a/aten/src/TH/generic/THTensorMath.cpp b/aten/src/TH/generic/THTensorMath.cpp index c521d1da750a43..24d9a7e8c4ea07 100644 --- a/aten/src/TH/generic/THTensorMath.cpp +++ b/aten/src/TH/generic/THTensorMath.cpp @@ 
-805,11 +805,11 @@ void THTensor_(addcdiv)(THTensor *r_, THTensor *t, real value, THTensor *src1, T void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *mat, THTensor *vec) { - if( (mat->dim() != 2) || (vec->dim() != 1) ) + if( (mat->dim() != 2) || (THTensor_nDimensionLegacyNoScalars(vec) != 1) ) THError("matrix and vector expected, got %dD, %dD", - mat->dim(), vec->dim()); + mat->dim(), THTensor_nDimensionLegacyNoScalars(vec)); - if( mat->size(1) != vec->size(0) ) { + if( mat->size(1) != THTensor_sizeLegacyNoScalars(vec, 0) ) { THDescBuff bm = THTensor_(sizeDesc)(mat); THDescBuff bv = THTensor_(sizeDesc)(vec); THError("size mismatch, %s, %s", bm.str, bv.str); @@ -837,14 +837,14 @@ void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor { THBlas_(gemv)('n', mat->size(0), mat->size(1), alpha, THTensor_(data)(mat), mat->stride(1), - THTensor_(data)(vec), vec->stride(0), + THTensor_(data)(vec), THTensor_strideLegacyNoScalars(vec, 0), beta, THTensor_(data)(r_), r_->stride(0)); } else if(mat->stride(1) == 1 && LDA_COND(mat->size(1), mat->size(0), mat->stride(0))) { THBlas_(gemv)('t', mat->size(1), mat->size(0), alpha, THTensor_(data)(mat), mat->stride(0), - THTensor_(data)(vec), vec->stride(0), + THTensor_(data)(vec), THTensor_strideLegacyNoScalars(vec, 0), beta, THTensor_(data)(r_), r_->stride(0)); } else @@ -853,7 +853,7 @@ void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor THBlas_(gemv)('t', mat->size(1), mat->size(0), alpha, THTensor_(data)(cmat), cmat->stride(0), - THTensor_(data)(vec), vec->stride(0), + THTensor_(data)(vec), THTensor_strideLegacyNoScalars(vec, 0), beta, THTensor_(data)(r_), r_->stride(0)); THTensor_(free)(cmat); @@ -861,7 +861,7 @@ void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor // In gemv (x,0).mv(0) does not // handle beta, whereas gemm does for case where (x,0).mm(0,y). 
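For readers tracking the LegacyNoScalars migration: the helpers added in THTensor.hpp above make a zero-dimensional tensor behave, at legacy call sites such as this addmv path, like a one-element vector with stride 1. The following is a minimal standalone sketch of that convention, not code from the patch; FakeTensor is a hypothetical stand-in for THTensor and the THArgCheck bounds checks are omitted.

#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for THTensor, only to illustrate the convention:
// an empty sizes vector models a 0-dim (scalar) tensor.
struct FakeTensor {
  std::vector<int64_t> sizes;
  std::vector<int64_t> strides;
};

// Sketch of THTensor_sizeLegacyNoScalars: a scalar reports size 1 at dim 0.
int64_t sizeLegacyNoScalars(const FakeTensor& t, int dim) {
  return t.sizes.empty() ? 1 : t.sizes[dim];
}

// Sketch of THTensor_strideLegacyNoScalars: a scalar reports stride 1 at dim 0.
int64_t strideLegacyNoScalars(const FakeTensor& t, int dim) {
  return t.sizes.empty() ? 1 : t.strides[dim];
}

int main() {
  FakeTensor scalar{{}, {}};   // 0-dim tensor
  FakeTensor vec{{5}, {1}};    // ordinary 1-dim tensor
  assert(sizeLegacyNoScalars(scalar, 0) == 1);
  assert(strideLegacyNoScalars(scalar, 0) == 1);
  assert(sizeLegacyNoScalars(vec, 0) == 5);
  assert(strideLegacyNoScalars(vec, 0) == 1);
  return 0;
}

This is why the surrounding addmv and addr hunks can pass THTensor_sizeLegacyNoScalars(vec, 0) and THTensor_strideLegacyNoScalars(vec, 0) straight into the BLAS calls without special-casing scalars.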
- if (vec->size(0) == 0 && mat->size(0) != 0) { + if (THTensor_sizeLegacyNoScalars(vec, 0) == 0 && mat->size(0) != 0) { if (beta == 0) { THTensor_(zero)(r_); } else if (beta != 1) { @@ -1058,14 +1058,19 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *vec1, THTensor *vec2) { - if( (vec1->dim() != 1) || (vec2->dim() != 1) ) + if( (THTensor_nDimensionLegacyNoScalars(vec1) != 1) || (THTensor_nDimensionLegacyNoScalars(vec2) != 1) ) THError("vector and vector expected, got %dD, %dD tensors", - vec1->dim(), vec2->dim()); + THTensor_nDimensionLegacyNoScalars(vec1), THTensor_nDimensionLegacyNoScalars(vec2)); if(t->dim() != 2) THError("expected matrix, got %dD tensor for t", t->dim()); - if( (t->size(0) != vec1->size(0)) || (t->size(1) != vec2->size(0)) ) { + auto vec1_size = THTensor_sizeLegacyNoScalars(vec1, 0); + auto vec2_size = THTensor_sizeLegacyNoScalars(vec2, 0); + auto vec1_stride = THTensor_strideLegacyNoScalars(vec1, 0); + auto vec2_stride = THTensor_strideLegacyNoScalars(vec2, 0); + + if( (t->size(0) != vec1_size) || (t->size(1) != vec2_size) ) { THDescBuff bt = THTensor_(sizeDesc)(t); THDescBuff bv1 = THTensor_(sizeDesc)(vec1); THDescBuff bv2 = THTensor_(sizeDesc)(vec2); @@ -1087,27 +1092,27 @@ void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor // n == 1 || lda >= max(1, m) #define LDA_COND(M, N, LDA) ((N) == 1 || (LDA) >= THMax(1, (M))) - if(r_->stride(0) == 1 && LDA_COND(vec1->size(0), vec2->size(0), r_->stride(1))) + if(r_->stride(0) == 1 && LDA_COND(vec1_size, vec2_size, r_->stride(1))) { - THBlas_(ger)(vec1->size(0), vec2->size(0), - alpha, THTensor_(data)(vec1), vec1->stride(0), - THTensor_(data)(vec2), vec2->stride(0), + THBlas_(ger)(vec1_size, vec2_size, + alpha, THTensor_(data)(vec1), vec1_stride, + THTensor_(data)(vec2), vec2_stride, THTensor_(data)(r_), r_->stride(1)); } - else if(r_->stride(1) == 1 && LDA_COND(vec2->size(0), vec1->size(0), r_->stride(0))) + else if(r_->stride(1) == 1 && LDA_COND(vec2->size(0), vec1_size, r_->stride(0))) { - THBlas_(ger)(vec2->size(0), vec1->size(0), - alpha, THTensor_(data)(vec2), vec2->stride(0), - THTensor_(data)(vec1), vec1->stride(0), + THBlas_(ger)(vec2_size, vec1_size, + alpha, THTensor_(data)(vec2), vec2_stride, + THTensor_(data)(vec1), vec1_stride, THTensor_(data)(r_), r_->stride(0)); } else { THTensor *cr = THTensor_(newClone)(r_); - THBlas_(ger)(vec2->size(0), vec1->size(0), - alpha, THTensor_(data)(vec2), vec2->stride(0), - THTensor_(data)(vec1), vec1->stride(0), + THBlas_(ger)(vec2_size, vec1_size, + alpha, THTensor_(data)(vec2), vec2_stride, + THTensor_(data)(vec1), vec1_stride, THTensor_(data)(cr), cr->stride(0)); THTensor_(freeCopyTo)(cr, r_); diff --git a/aten/src/TH/generic/THTensorMoreMath.cpp b/aten/src/TH/generic/THTensorMoreMath.cpp index d06ec255644cce..fa8fb0558661ea 100644 --- a/aten/src/TH/generic/THTensorMoreMath.cpp +++ b/aten/src/TH/generic/THTensorMoreMath.cpp @@ -557,9 +557,6 @@ void THTensor_(onesLike)(THTensor *r_, THTensor *input) void THTensor_(diag)(THTensor *r_, THTensor *t, int k) { -#ifndef USE_TH_SIZE_ZERO_DIM - AT_ASSERT(!t->is_empty()) -#endif THArgCheck(THTensor_(nDimensionLegacyNoScalars)(t) == 1 || THTensor_(nDimensionLegacyNoScalars)(t) == 2, 1, "matrix or a vector expected"); if(THTensor_(nDimensionLegacyNoScalars)(t) == 1) @@ -1186,19 +1183,11 @@ void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, i void 
THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted) { -#ifndef USE_TH_SIZE_ZERO_DIM - int numDims = THTensor_(nDimensionLegacyAll)(t); -#else int numDims = THTensor_(nDimensionLegacyNoScalars)(t); -#endif THArgCheck(dim >= 0 && dim < numDims, 3, "dim not in range"); int64_t sliceSize = THTensor_(size)(t, dim); -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(k > 0 && k <= sliceSize, 2, "k not in range for dimension"); -#else THArgCheck(k >= 0 && k <= sliceSize, 2, "k not in range for dimension"); -#endif THTensor *tmpResults = THTensor_(new)(); THTensor_(resize1d)(tmpResults, sliceSize); diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index 9df36f097ba6ee..a8fb33c11a5bd4 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -10,7 +10,7 @@ #include "THCTensorInfo.cuh" int THCTensor_nDimensionLegacyNoScalars(THCState *state, const THCTensor *self) { - return self->dim(); + return THTensor_nDimensionLegacyNoScalars(self); } int THCTensor_nDimensionLegacyAll(THCState *state, const THCTensor *self) { @@ -99,15 +99,6 @@ void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, int64_ for(d = 0; d < nDimension; d++) { -#ifndef USE_TH_SIZE_ZERO_DIM - // we can't support this unless we have arbitrary 0-sized dimensions, but some calls to this - // currently exist and expect a size [0] tensor to be returned. - if (d == 0 && size[d] == 0) { - nDimension = 1; - } else { - AT_CHECK(size[d] > 0, "sizes must be non-negative"); - } -#endif if((self->dim() > d) && (size[d] != self->size(d))) { hascorrectsize = false; } @@ -234,9 +225,6 @@ void THCTensor_unsqueeze1d(THCState *state, THCTensor *self, THCTensor *src, int src = self; THArgCheck((dimension >= 0) && (dimension <= src->dim()), 3, "dimension out of range"); -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(!src->is_empty(), 3, "cannot unsqueeze empty tensor"); -#endif THCTensor_set(state, self, src); diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp index e15ba5e5a2c666..940af6eb86ead4 100644 --- a/aten/src/THC/generic/THCTensor.cpp +++ b/aten/src/THC/generic/THCTensor.cpp @@ -28,11 +28,21 @@ int64_t THCTensor_(size)(THCState *state, const THCTensor *self, int dim) return THCTensor_size(state, self, dim); } +int64_t THCTensor_(sizeLegacyNoScalars)(THCState *state, const THCTensor *self, int dim) +{ + return THTensor_sizeLegacyNoScalars(self, dim); +} + int64_t THCTensor_(stride)(THCState *state, const THCTensor *self, int dim) { return THCTensor_stride(state, self, dim); } +int64_t THCTensor_(strideLegacyNoScalars)(THCState *state, const THCTensor *self, int dim) +{ + return THTensor_strideLegacyNoScalars(self, dim); +} + THLongStorage *THCTensor_(newSizeOf)(THCState *state, THCTensor *self) { return THCTensor_newSizeOf(state, self); @@ -367,11 +377,7 @@ void THCTensor_(narrow)(THCState *state, THCTensor *self, THCTensor *src, int di THArgCheck( (dimension >= 0) && (dimension < src->dim()), 3, "out of range"); THArgCheck( firstIndex >= 0, 4, "out of range"); -#ifdef USE_TH_SIZE_ZERO_DIM THArgCheck( size >= 0, 5, "out of range"); -#else - THArgCheck( size > 0, 5, "out of range"); -#endif THArgCheck(firstIndex+size <= src->size(dimension), 5, "out of range"); THCTensor_(set)(state, self, src); @@ -390,12 +396,8 @@ void THCTensor_(select)(THCState *state, THCTensor *self, THCTensor *src, int di if(!src) src = self; -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(THTensor_nDimensionLegacyAll(src) > 1, 1, "cannot select on a 
vector"); -#else #ifndef USE_TH_SCALAR THArgCheck(src->dim() > 1, 1, "cannot select on a vector"); -#endif #endif THArgCheck((dimension >= 0) && (dimension < src->dim()), 3, "out of range"); THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size(dimension)), 4, "out of range"); @@ -417,8 +419,8 @@ void THCTensor_(transpose)(THCState *state, THCTensor *self, THCTensor *src, int if(!src) src = self; - THArgCheck( (dimension1 >= 0) && (dimension1 < src->dim()), 1, "out of range"); - THArgCheck( (dimension2 >= 0) && (dimension2 < src->dim()), 2, "out of range"); + THArgCheck( (dimension1 >= 0) && (dimension1 < THTensor_nDimensionLegacyNoScalars(src)), 1, "out of range"); + THArgCheck( (dimension2 >= 0) && (dimension2 < THTensor_nDimensionLegacyNoScalars(src)), 2, "out of range"); THCTensor_(set)(state, self, src); @@ -440,11 +442,8 @@ void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src, int di if(!src) src = self; -#ifndef USE_TH_SIZE_ZERO_DIM - THArgCheck(!src->is_empty(), 1, "cannot unfold an empty tensor"); -#endif - THArgCheck(dimension < src->dim(), 2, "out of range"); - THArgCheck(size <= src->size(dimension), 3, "out of range"); + THArgCheck(dimension < THTensor_nDimensionLegacyNoScalars(src), 2, "out of range"); + THArgCheck(size <= THTensor_sizeLegacyNoScalars(src, dimension), 3, "out of range"); THArgCheck(step > 0, 4, "invalid step"); THCTensor_(set)(state, self, src); @@ -453,18 +452,20 @@ void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src, int di std::vector newStride(self->dim() + 1); newSize[self->dim()] = size; - newStride[self->dim()] = self->stride(dimension); + newStride[self->dim()] = THTensor_strideLegacyNoScalars(self, dimension); for(d = 0; d < self->dim(); d++) { + auto self_size = THTensor_sizeLegacyNoScalars(self, d); + auto self_stride = THTensor_strideLegacyNoScalars(self, d); if(d == dimension) { - newSize[d] = (self->size(d) - size) / step + 1; - newStride[d] = step*self->stride(d); + newSize[d] = (self_size - size) / step + 1; + newStride[d] = step*self_stride; } else { - newSize[d] = self->size(d); - newStride[d] = self->stride(d); + newSize[d] = self_size; + newStride[d] = self_stride; } } @@ -603,15 +604,15 @@ void THCTensor_(resizeNd)(THCState *state, THCTensor *self, int nDimension, int6 void THCTensor_(set1d)(THCState *state, THCTensor *tensor, int64_t x0, real value) { - THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); + THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension"); + THArgCheck( (x0 >= 0) && (x0 < THTensor_sizeLegacyNoScalars(tensor, 0)), 2, "out of range"); THCStorage_(set)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0), value); } real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, int64_t x0) { - THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); + THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension"); + THArgCheck( (x0 >= 0) && (x0 < THTensor_sizeLegacyNoScalars(tensor, 0)), 2, "out of range"); return THCStorage_(get)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)); } diff --git a/aten/src/THC/generic/THCTensor.h b/aten/src/THC/generic/THCTensor.h index dbb1591ae194f2..2ee1bf11a4be4c 100644 --- a/aten/src/THC/generic/THCTensor.h +++ 
b/aten/src/THC/generic/THCTensor.h @@ -26,7 +26,9 @@ THC_API int THCTensor_(nDimensionLegacyNoScalars)(THCState *state, const THCTens THC_API int THCTensor_(nDimensionLegacyAll)(THCState *state, const THCTensor *self); THC_API int64_t THCTensor_(size)(THCState *state, const THCTensor *self, int dim); +THC_API int64_t THCTensor_(sizeLegacyNoScalars)(THCState *state, const THCTensor *self, int dim); THC_API int64_t THCTensor_(stride)(THCState *state, const THCTensor *self, int dim); +THC_API int64_t THCTensor_(strideLegacyNoScalars)(THCState *state, const THCTensor *self, int dim); THC_API THLongStorage *THCTensor_(newSizeOf)(THCState *state, THCTensor *self); THC_API THLongStorage *THCTensor_(newStrideOf)(THCState *state, THCTensor *self); THC_API real *THCTensor_(data)(THCState *state, const THCTensor *self); diff --git a/aten/src/THC/generic/THCTensorIndex.cu b/aten/src/THC/generic/THCTensorIndex.cu index 4cbf5dd224abe5..82f56f9946e471 100644 --- a/aten/src/THC/generic/THCTensorIndex.cu +++ b/aten/src/THC/generic/THCTensorIndex.cu @@ -537,16 +537,6 @@ void THCTensor_(indexSelect)(THCState *state, THCTensor *dst, THCTensor *src, in THLongStorage *newSize; -#ifndef USE_TH_SIZE_ZERO_DIM - if (numIndices == 0) { - newSize = THCTensor_(newSizeOf)(state, src); - THLongStorage_set(newSize, 0, numIndices); - THCTensor_(resize)(state, dst, newSize, NULL); - THLongStorage_free(newSize); - return; - } -#endif - newSize = THCTensor_(newSizeOf)(state, src); THLongStorage_set(newSize, dim, numIndices); THCTensor_(resize)(state, dst, newSize, NULL); diff --git a/aten/src/THC/generic/THCTensorMath.cu b/aten/src/THC/generic/THCTensorMath.cu index 642b14aec48cfd..cc1a8c9ba57e41 100644 --- a/aten/src/THC/generic/THCTensorMath.cu +++ b/aten/src/THC/generic/THCTensorMath.cu @@ -330,9 +330,6 @@ void THCTensor_(nonzero)(THCState* state, THCudaLongTensor *tensor, void THCTensor_(diag)(THCState *state, THCTensor *self_, THCTensor *src_, int64_t k){ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); int nDimension = THCTensor_(nDimensionLegacyNoScalars)(state, src_); -#ifndef USE_TH_SIZE_ZERO_DIM - AT_ASSERT(!src_->is_empty()); -#endif THArgCheck((nDimension == 2) || (nDimension == 1), 1, "expected a matrix or a vector"); if (nDimension == 2) { int64_t stride0 = THCTensor_(stride)(state, src_, 0); diff --git a/aten/src/THC/generic/THCTensorMathBlas.cu b/aten/src/THC/generic/THCTensorMathBlas.cu index 17ef020e85f8ee..591780b04edf75 100644 --- a/aten/src/THC/generic/THCTensorMathBlas.cu +++ b/aten/src/THC/generic/THCTensorMathBlas.cu @@ -49,11 +49,15 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, mat, vec)); - if( (mat->dim() != 2) || (vec->dim() != 1) ) + if( (mat->dim() != 2) || (THTensor_nDimensionLegacyNoScalars(vec) != 1) ) THError("2D tensor and 1D tensor expected, got %dD, %dD tensors", - mat->dim(), vec->dim()); + mat->dim(), THTensor_nDimensionLegacyNoScalars(vec)); - if( mat->size(1) != vec->size(0) ) + + auto vec_size = THTensor_sizeLegacyNoScalars(vec, 0); + auto vec_stride = THTensor_strideLegacyNoScalars(vec, 0); + + if( mat->size(1) != THTensor_sizeLegacyNoScalars(vec, 0) ) THError("size mismatch"); if(t->dim() != 1) @@ -74,12 +78,12 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real #ifdef THC_REAL_IS_FLOAT THCudaBlas_Sgemv(state, 'n', mat->size(0), mat->size(1), alpha, 
THCTensor_(data)(state, mat), mat->stride(1), - THCTensor_(data)(state, vec), vec->stride(0), + THCTensor_(data)(state, vec), vec_stride, beta, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) THCudaBlas_Dgemv(state, 'n', mat->size(0), mat->size(1), alpha, THCTensor_(data)(state, mat), mat->stride(1), - THCTensor_(data)(state, vec), vec->stride(0), + THCTensor_(data)(state, vec), vec_stride, beta, THCTensor_(data)(state, r_), r_->stride(0)); #endif } @@ -88,12 +92,12 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real #ifdef THC_REAL_IS_FLOAT THCudaBlas_Sgemv(state, 't', mat->size(1), mat->size(0), alpha, THCTensor_(data)(state, mat), mat->stride(0), - THCTensor_(data)(state, vec), vec->stride(0), + THCTensor_(data)(state, vec), vec_stride, beta, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) THCudaBlas_Dgemv(state, 't', mat->size(1), mat->size(0), alpha, THCTensor_(data)(state, mat), mat->stride(0), - THCTensor_(data)(state, vec), vec->stride(0), + THCTensor_(data)(state, vec), vec_stride, beta, THCTensor_(data)(state, r_), r_->stride(0)); #endif } @@ -104,12 +108,12 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real #ifdef THC_REAL_IS_FLOAT THCudaBlas_Sgemv(state, 't', mat->size(1), mat->size(0), alpha, THCTensor_(data)(state, cmat), cmat->stride(0), - THCTensor_(data)(state, vec), vec->stride(0), + THCTensor_(data)(state, vec), vec_stride, beta, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) THCudaBlas_Dgemv(state, 't', mat->size(1), mat->size(0), alpha, THCTensor_(data)(state, cmat), cmat->stride(0), - THCTensor_(data)(state, vec), vec->stride(0), + THCTensor_(data)(state, vec), vec_stride, beta, THCTensor_(data)(state, r_), r_->stride(0)); #endif @@ -129,7 +133,7 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real #elif defined(THC_REAL_IS_HALF) // Currently no Hgemv/SgemvEx in Cublas THCTensor *vecAsMatrix = THCTensor_(newWithTensor)(state, vec); - THCTensor_(resize2d)(state, vecAsMatrix, vecAsMatrix->size(0), 1); + THCTensor_(resize2d)(state, vecAsMatrix, vec_size, 1); THCTensor *tAsMatrix = THCTensor_(newWithTensor)(state, t); THCTensor_(resize2d)(state, tAsMatrix, tAsMatrix->size(0), 1); @@ -151,16 +155,20 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, vec1, vec2)); - if ( (vec1->dim() != 1) || (vec2->dim() != 1) ) { + if ( (THTensor_nDimensionLegacyNoScalars(vec1) != 1) || (THTensor_nDimensionLegacyNoScalars(vec2) != 1) ) { THError("1D tensors expected, got %dD, %dD tensors", - vec1->dim(), vec2->dim()); + THTensor_nDimensionLegacyNoScalars(vec1), THTensor_nDimensionLegacyNoScalars(vec2)); } + auto vec1_size = THTensor_sizeLegacyNoScalars(vec1, 0); + auto vec2_size = THTensor_sizeLegacyNoScalars(vec2, 0); + auto vec1_stride = THTensor_strideLegacyNoScalars(vec1, 0); + auto vec2_stride = THTensor_strideLegacyNoScalars(vec2, 0); if (t->dim() != 2) { THError("size mismatch"); } - if ( (t->size(0) != vec1->size(0)) || (t->size(1) != vec2->size(0)) ) { + if ( (t->size(0) != vec1_size) || (t->size(1) != vec2_size) ) { THError("size mismatch"); } @@ -179,28 +187,28 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a if(r_->stride(0) == 1) { #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sger(state, 
vec1->size(0), vec2->size(0), - alpha, THCTensor_(data)(state, vec1), vec1->stride(0), - THCTensor_(data)(state, vec2), vec2->stride(0), + THCudaBlas_Sger(state, vec1_size, vec2_size, + alpha, THCTensor_(data)(state, vec1), vec1_stride, + THCTensor_(data)(state, vec2), vec2_stride, THCTensor_(data)(state, r_), r_->stride(1)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dger(state, vec1->size(0), vec2->size(0), - alpha, THCTensor_(data)(state, vec1), vec1->stride(0), - THCTensor_(data)(state, vec2), vec2->stride(0), + THCudaBlas_Dger(state, vec1->size(0), vec2_size, + alpha, THCTensor_(data)(state, vec1), vec1_stride, + THCTensor_(data)(state, vec2), vec2_stride, THCTensor_(data)(state, r_), r_->stride(1)); #endif } else if(r_->stride(1) == 1) { #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sger(state, vec2->size(0), vec1->size(0), - alpha, THCTensor_(data)(state, vec2), vec2->stride(0), - THCTensor_(data)(state, vec1), vec1->stride(0), + THCudaBlas_Sger(state, vec2_size, vec1_size, + alpha, THCTensor_(data)(state, vec2), vec2_stride, + THCTensor_(data)(state, vec1), vec1_stride, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dger(state, vec2->size(0), vec1->size(0), - alpha, THCTensor_(data)(state, vec2), vec2->stride(0), - THCTensor_(data)(state, vec1), vec1->stride(0), + THCudaBlas_Dger(state, vec2_size, vec1_size, + alpha, THCTensor_(data)(state, vec2), vec2_stride, + THCTensor_(data)(state, vec1), vec1_stride, THCTensor_(data)(state, r_), r_->stride(0)); #endif } @@ -209,14 +217,14 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a THCTensor *cr = THCTensor_(newClone)(state, r_); #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sger(state, vec2->size(0), vec1->size(0), - alpha, THCTensor_(data)(state, vec2), vec2->stride(0), - THCTensor_(data)(state, vec1), vec1->stride(0), + THCudaBlas_Sger(state, vec2_size, vec1_size, + alpha, THCTensor_(data)(state, vec2), vec2_stride, + THCTensor_(data)(state, vec1), vec1_stride, THCTensor_(data)(state, cr), cr->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dger(state, vec2->size(0), vec1->size(0), - alpha, THCTensor_(data)(state, vec2), vec2->stride(0), - THCTensor_(data)(state, vec1), vec1->stride(0), + THCudaBlas_Dger(state, vec2_size, vec1_size, + alpha, THCTensor_(data)(state, vec2), vec2_stride, + THCTensor_(data)(state, vec1), vec1_stride, THCTensor_(data)(state, cr), cr->stride(0)); #endif @@ -225,11 +233,11 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a #elif defined(THC_REAL_IS_HALF) // currently no Hger/SgerEx in Cublas. 
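These addr hunks change only how the vector lengths and strides are obtained (via the LegacyNoScalars helpers) before being handed to the BLAS rank-1 update; the update itself is unchanged. As a reminder, Sger/Dger compute A += alpha * x * y^T on a column-major matrix, roughly equivalent to the reference loop below; ger_reference is an illustrative name, not a function in this codebase. For half precision, where no ger routine exists, the code instead reshapes the vectors to matrices and dispatches to addmm, as the lines that follow show.

// Reference semantics of the BLAS ger rank-1 update used by addr:
// a(i, j) += alpha * x(i) * y(j), with a stored column-major and
// leading dimension lda; incx/incy are the element strides of x and y.
void ger_reference(int m, int n, float alpha,
                   const float* x, int incx,
                   const float* y, int incy,
                   float* a, int lda) {
  for (int j = 0; j < n; ++j) {
    for (int i = 0; i < m; ++i) {
      a[i + j * lda] += alpha * x[i * incx] * y[j * incy];
    }
  }
}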
THCTensor *vec2T = THCTensor_(newWithTensor)(state, vec2); - THCTensor_(resize2d)(state, vec2T, vec2T->size(0), 1); + THCTensor_(resize2d)(state, vec2T, vec2_size, 1); THCTensor_(transpose)(state, vec2T, NULL, 0, 1); THCTensor *vec1M = THCTensor_(newWithTensor)(state, vec1); - THCTensor_(resize2d)(state, vec1M, vec1M->size(0), 1); + THCTensor_(resize2d)(state, vec1M, vec1_size, 1); THCTensor_(addmm)(state, r_, beta, t, alpha, vec1M, vec2T); THCTensor_(free)(state, vec2T); diff --git a/aten/src/THCUNN/CMakeLists.txt b/aten/src/THCUNN/CMakeLists.txt index 79b11c2db9b64f..78faef7a7f227b 100644 --- a/aten/src/THCUNN/CMakeLists.txt +++ b/aten/src/THCUNN/CMakeLists.txt @@ -43,7 +43,6 @@ ${CMAKE_CURRENT_SOURCE_DIR}/SpatialDilatedMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialFractionalMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialFullConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialFullDilatedConvolution.cu -${CMAKE_CURRENT_SOURCE_DIR}/SpatialGridSamplerBilinear.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialMaxUnpooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialReflectionPadding.cu @@ -71,7 +70,6 @@ ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricDilatedMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFractionalMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFullConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFullDilatedConvolution.cu -${CMAKE_CURRENT_SOURCE_DIR}/VolumetricGridSamplerBilinear.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricMaxUnpooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricReplicationPadding.cu diff --git a/aten/src/THCUNN/ELU.cu b/aten/src/THCUNN/ELU.cu index d17d185b4858bf..9c4c2ea1fdc8b6 100644 --- a/aten/src/THCUNN/ELU.cu +++ b/aten/src/THCUNN/ELU.cu @@ -8,15 +8,17 @@ struct ELUupdateOutput_functor { const T negcoef_; const T poscoef_; + const T negiptcoef_; - ELUupdateOutput_functor(T negcoef, T poscoef) + ELUupdateOutput_functor(T negcoef, T poscoef, T negiptcoef) : negcoef_(negcoef) , poscoef_(poscoef) + , negiptcoef_(negiptcoef) {} __device__ void operator()(T *output, const T *input) const { - *output = *input <= 0 ? (exp(*input) - 1) * negcoef_ : *input * poscoef_; + *output = *input <= 0 ? (exp(*input * negiptcoef_) - 1) * negcoef_ : *input * poscoef_; } }; @@ -26,15 +28,17 @@ struct ELUupdateOutputIP_functor { const T negcoef_; const T poscoef_; + const T negiptcoef_; - ELUupdateOutputIP_functor(T negcoef, T poscoef) + ELUupdateOutputIP_functor(T negcoef, T poscoef, T negiptcoef) : negcoef_(negcoef) , poscoef_(poscoef) + , negiptcoef_(negiptcoef) {} __device__ void operator()(T *x) const { - *x = *x <= 0 ? (exp(*x) - 1) * negcoef_ : *x * poscoef_; + *x = *x <= 0 ? (exp(*x * negiptcoef_) - 1) * negcoef_ : *x * poscoef_; } }; @@ -43,15 +47,17 @@ struct ELUupdateGradInput_functor { const T negcoef_; const T poscoef_; + const T negiptcoef_; - ELUupdateGradInput_functor(T negcoef, T poscoef) + ELUupdateGradInput_functor(T negcoef, T poscoef, T negiptcoef) : negcoef_(negcoef) , poscoef_(poscoef) + , negiptcoef_(negiptcoef) {} __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const { - *gradInput = (*output) <= 0 ? (*gradOutput * (*output + negcoef_)) : (*gradOutput * poscoef_); + *gradInput = (*output) <= 0 ? 
(*gradOutput * negiptcoef_ * (*output + negcoef_)) : (*gradOutput * poscoef_); } }; diff --git a/aten/src/THCUNN/SpatialGridSamplerBilinear.cu b/aten/src/THCUNN/SpatialGridSamplerBilinear.cu deleted file mode 100644 index 30a1a5d5ade10b..00000000000000 --- a/aten/src/THCUNN/SpatialGridSamplerBilinear.cu +++ /dev/null @@ -1,243 +0,0 @@ -#include "THCUNN.h" -#include "common.h" -#include "THCDeviceTensor.cuh" -#include "THCDeviceTensorUtils.cuh" -#include "THCDeviceUtils.cuh" -#include "THCHalf.h" -#include "THCHalfAutoNumerics.cuh" -#include "THCAtomics.cuh" - -#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < W && y >= 0 && y < H) -#define SAFE_ADD(input, x, y, n, c, H, W, value) \ - do { \ - if (WITHIN_BOUNDS(x, y, H, W)) { \ - atomicAdd(&input[n][c][y][x], value); \ - } \ - } while(0) - -#undef MIN -#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) -#undef MAX -#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) -#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) - -const int MODE_BORDER = 1; - - -template -__launch_bounds__(1024) -__global__ void SpatialGridSamplerBilinear_updateOutput_kernel( - const int nthreads, - THCDeviceTensor input, - THCDeviceTensor grid, - THCDeviceTensor output, - const int padding_mode) { - - int N = input.getSize(0); - int C = input.getSize(1); - int IH = input.getSize(2); - int IW = input.getSize(3); - int H = grid.getSize(1); - int W = grid.getSize(2); - - CUDA_KERNEL_LOOP(index, nthreads) { - - const int n = index % N; - const int h = (index / N) % H; - const int w = (index / (N * H)) % W; - int c; - - // get the corresponding input x, y co-ordinates from grid - Dtype ix = grid[n][h][w][0]; - Dtype iy = grid[n][h][w][1]; - - // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] - ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); - iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1)); - - // get NE, NW, SE, SW pixel values from (x, y) - int ix_nw = floor(ScalarConvert::to(ix)); - int iy_nw = floor(ScalarConvert::to(iy)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - Dtype nw = (ix_se - ix) * (iy_se - iy); - Dtype ne = (ix - ix_sw) * (iy_sw - iy); - Dtype sw = (ix_ne - ix) * (iy - iy_ne); - Dtype se = (ix - ix_nw) * (iy - iy_nw); - - // calculate bilinear weighted pixel value and set output pixel - if (padding_mode==MODE_BORDER){ - // clip coordinates to image borders - CLIP_COORDINATES(ix_nw, ix_nw, IW); - CLIP_COORDINATES(iy_nw, iy_nw, IH); - CLIP_COORDINATES(ix_ne, ix_ne, IW); - CLIP_COORDINATES(iy_ne, iy_ne, IH); - CLIP_COORDINATES(ix_sw, ix_sw, IW); - CLIP_COORDINATES(iy_sw, iy_sw, IH); - CLIP_COORDINATES(ix_se, ix_se, IW); - CLIP_COORDINATES(iy_se, iy_se, IH); - } - - Dtype out_val; - for (c = 0; c < C; ++c) { - out_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_nw, iy_nw, IH, IW)) { - out_val += input[n][c][iy_nw][ix_nw] * nw; - } - if (WITHIN_BOUNDS(ix_ne, iy_ne, IH, IW)) { - out_val += input[n][c][iy_ne][ix_ne] * ne; - } - if (WITHIN_BOUNDS(ix_sw, iy_sw, IH, IW)) { - out_val += input[n][c][iy_sw][ix_sw] * sw; - } - if (WITHIN_BOUNDS(ix_se, iy_se, IH, IW)) { - out_val += input[n][c][iy_se][ix_se] * se; - } - output[n][c][h][w] = out_val; - } - } -} - -template -__launch_bounds__(1024) -__global__ void SpatialGridSamplerBilinear_updateGradInput_kernel( - const int nthreads, - THCDeviceTensor input, THCDeviceTensor gradInput, - THCDeviceTensor grid, THCDeviceTensor gradGrid, - THCDeviceTensor 
gradOutput, - const int padding_mode) { - - int N = input.getSize(0); - int C = input.getSize(1); - int IH = input.getSize(2); - int IW = input.getSize(3); - int H = grid.getSize(1); - int W = grid.getSize(2); - - CUDA_KERNEL_LOOP(index, nthreads) { - - const int n = index % N; - const int h = (index / N) % H; - const int w = (index / (N * H)) % W; - - // get the corresponding input x, y co-ordinates from grid - Dtype ix = grid[n][h][w][0]; - Dtype iy = grid[n][h][w][1]; - - Dtype gix = ScalarConvert::to(0); - Dtype giy = ScalarConvert::to(0); - - // normalize ix, iy from [-1, 1] to [0, H-1] & [0, W-1] - ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); - iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1));; - - // get NE, NW, SE, SW pixel values from (x, y) - int ix_nw = floor(ScalarConvert::to(ix)); - int iy_nw = floor(ScalarConvert::to(iy));; - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - Dtype nw = (ix_se - ix) * (iy_se - iy); - Dtype ne = (ix - ix_sw) * (iy_sw - iy); - Dtype sw = (ix_ne - ix) * (iy - iy_ne); - Dtype se = (ix - ix_nw) * (iy - iy_nw); - - Dtype gradout; - Dtype nw_val; - Dtype ne_val; - Dtype sw_val; - Dtype se_val; - - int ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; - - if (padding_mode==MODE_BORDER){ - // get clipped NE, NW, SE, SW pixel values from (x, y) - CLIP_COORDINATES(ix_nw, ix_nw_cl, IW); - CLIP_COORDINATES(iy_nw, iy_nw_cl, IH); - CLIP_COORDINATES(ix_ne, ix_ne_cl, IW); - CLIP_COORDINATES(iy_ne, iy_ne_cl, IH); - CLIP_COORDINATES(ix_sw, ix_sw_cl, IW); - CLIP_COORDINATES(iy_sw, iy_sw_cl, IH); - CLIP_COORDINATES(ix_se, ix_se_cl, IW); - CLIP_COORDINATES(iy_se, iy_se_cl, IH); - } - else { - ix_nw_cl = ix_nw; - iy_nw_cl = iy_nw; - ix_ne_cl = ix_ne; - iy_ne_cl = iy_ne; - ix_sw_cl = ix_sw; - iy_sw_cl = iy_sw; - ix_se_cl = ix_se; - iy_se_cl = iy_se; - } - - for (int c = 0; c < C; ++c) { - gradout = gradOutput[n][c][h][w]; - - // calculate and set gradInput - SAFE_ADD(gradInput, ix_nw_cl, iy_nw_cl, n, c, IH, IW, nw * gradout); - SAFE_ADD(gradInput, ix_ne_cl, iy_ne_cl, n, c, IH, IW, ne * gradout); - SAFE_ADD(gradInput, ix_sw_cl, iy_sw_cl, n, c, IH, IW, sw * gradout); - SAFE_ADD(gradInput, ix_se_cl, iy_se_cl, n, c, IH, IW, se * gradout); - - // calculate gradGrid - nw_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_nw_cl, iy_nw_cl, IH, IW)) { - nw_val = input[n][c][iy_nw_cl][ix_nw_cl]; - } - ne_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_ne_cl, iy_ne_cl, IH, IW)) { - ne_val = input[n][c][iy_ne_cl][ix_ne_cl]; - } - sw_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_sw_cl, iy_sw_cl, IH, IW)) { - sw_val = input[n][c][iy_sw_cl][ix_sw_cl]; - } - se_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_se_cl, iy_se_cl, IH, IW)) { - se_val = input[n][c][iy_se_cl][ix_se_cl]; - } - - gix += ScalarConvert::to(-1)*(nw_val * (iy_se - iy) * gradout); - gix += ne_val * (iy_sw - iy) * gradout; - gix += ScalarConvert::to(-1)*(sw_val * (iy - iy_ne) * gradout); - gix += se_val * (iy - iy_nw) * gradout; - - giy += ScalarConvert::to(-1)*(nw_val * (ix_se - ix) * gradout); - giy += ScalarConvert::to(-1)*(ne_val * (ix - ix_sw) * gradout); - giy += sw_val * (ix_ne - ix) * gradout; - giy += se_val * (ix - ix_nw) * gradout; - } - - // un-normalize gradGrid values back to [-1, 1] constraints - gix = gix * (IW - 1) / 2; - giy = giy * (IH - 1) / 2; - - Dtype gix_old = gradGrid[n][h][w][0]; - Dtype giy_old = 
gradGrid[n][h][w][1]; - - gradGrid[n][h][w][0] = gix_old + gix; - gradGrid[n][h][w][1] = giy_old + giy; - } -} - -#undef MIN -#undef MAX -#undef CLIP_COORDINATES -#undef WITHIN_BOUNDS -#undef SAFE_ADD - -#include "generic/SpatialGridSamplerBilinear.cu" -#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricGridSamplerBilinear.cu b/aten/src/THCUNN/VolumetricGridSamplerBilinear.cu deleted file mode 100644 index 43b8ceff1cb8ae..00000000000000 --- a/aten/src/THCUNN/VolumetricGridSamplerBilinear.cu +++ /dev/null @@ -1,421 +0,0 @@ -#include "THCUNN.h" -#include "common.h" -#include "THCDeviceTensor.cuh" -#include "THCDeviceTensorUtils.cuh" -#include "THCDeviceUtils.cuh" -#include "THCHalf.h" -#include "THCHalfAutoNumerics.cuh" -#include "THCAtomics.cuh" - -#define WITHIN_BOUNDS(x, y, z, D, H, W) (x >= 0 && x < W && y >= 0 && y < H && z >= 0 && z < D) -#define SAFE_ADD(input, x, y, z, n, c, D, H, W, value) \ - do { \ - if (WITHIN_BOUNDS(x, y, z, D, H, W)) { \ - atomicAdd(&input[n][c][z][y][x], value); \ - } \ - } while(0) - -#undef MIN -#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) -#undef MAX -#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) -#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) - -const int MODE_BORDER = 1; - - -template -__launch_bounds__(1024) -__global__ void VolumetricGridSamplerBilinear_updateOutput_kernel( - const int nthreads, - THCDeviceTensor input, - THCDeviceTensor grid, - THCDeviceTensor output, - const int padding_mode) { - - int N = input.getSize(0); - int C = input.getSize(1); - int ID = input.getSize(2); - int IH = input.getSize(3); - int IW = input.getSize(4); - int D = grid.getSize(1); - int H = grid.getSize(2); - int W = grid.getSize(3); - - CUDA_KERNEL_LOOP(index, nthreads) { - - const int n = index % N; - const int d = (index / N) % D; - const int h = (index / (N * D)) % H; - const int w = (index / (N * D * H)) % W; - int c; - - // get the corresponding input x, y, z co-ordinates from grid - Dtype ix = grid[n][d][h][w][0]; - Dtype iy = grid[n][d][h][w][1]; - Dtype iz = grid[n][d][h][w][2]; - - // normalize ix, iy, iz from [-1, 1] to [0, IW-1] & [0, IH-1] & [0, ID-1] - ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); - iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1)); - iz = ScalarConvert::to(((iz + 1.f) / 2) * (ID-1)); - - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int ix_tnw = floor(ScalarConvert::to(ix)); - int iy_tnw = floor(ScalarConvert::to(iy)); - int iz_tnw = floor(ScalarConvert::to(iz)); - - int ix_tne = ix_tnw + 1; - int iy_tne = iy_tnw; - int iz_tne = iz_tnw; - - int ix_tsw = ix_tnw; - int iy_tsw = iy_tnw + 1; - int iz_tsw = iz_tnw; - - int ix_tse = ix_tnw + 1; - int iy_tse = iy_tnw + 1; - int iz_tse = iz_tnw; - - int ix_bnw = ix_tnw; - int iy_bnw = iy_tnw; - int iz_bnw = iz_tnw + 1; - - int ix_bne = ix_tnw + 1; - int iy_bne = iy_tnw; - int iz_bne = iz_tnw + 1; - - int ix_bsw = ix_tnw; - int iy_bsw = iy_tnw + 1; - int iz_bsw = iz_tnw + 1; - - int ix_bse = ix_tnw + 1; - int iy_bse = iy_tnw + 1; - int iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - Dtype tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - Dtype tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - Dtype tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - Dtype tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - Dtype bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - Dtype bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - Dtype bsw = 
(ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - Dtype bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - // calculate bilinear weighted pixel value and set output pixel - if (padding_mode==MODE_BORDER){ - // clip coordinates to image borders - CLIP_COORDINATES(ix_tnw, ix_tnw, IW); - CLIP_COORDINATES(iy_tnw, iy_tnw, IH); - CLIP_COORDINATES(iz_tnw, iz_tnw, ID); - CLIP_COORDINATES(ix_tne, ix_tne, IW); - CLIP_COORDINATES(iy_tne, iy_tne, IH); - CLIP_COORDINATES(iz_tne, iz_tne, ID); - CLIP_COORDINATES(ix_tsw, ix_tsw, IW); - CLIP_COORDINATES(iy_tsw, iy_tsw, IH); - CLIP_COORDINATES(iz_tsw, iz_tsw, ID); - CLIP_COORDINATES(ix_tse, ix_tse, IW); - CLIP_COORDINATES(iy_tse, iy_tse, IH); - CLIP_COORDINATES(iz_tse, iz_tse, ID); - CLIP_COORDINATES(ix_bnw, ix_bnw, IW); - CLIP_COORDINATES(iy_bnw, iy_bnw, IH); - CLIP_COORDINATES(iz_bnw, iz_bnw, ID); - CLIP_COORDINATES(ix_bne, ix_bne, IW); - CLIP_COORDINATES(iy_bne, iy_bne, IH); - CLIP_COORDINATES(iz_bne, iz_bne, ID); - CLIP_COORDINATES(ix_bsw, ix_bsw, IW); - CLIP_COORDINATES(iy_bsw, iy_bsw, IH); - CLIP_COORDINATES(iz_bsw, iz_bsw, ID); - CLIP_COORDINATES(ix_bse, ix_bse, IW); - CLIP_COORDINATES(iy_bse, iy_bse, IH); - CLIP_COORDINATES(iz_bse, iz_bse, ID); - } - - Dtype out_val; - for (c = 0; c < C; ++c) { - out_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_tnw, iy_tnw, iz_tnw, ID, IH, IW)) { - out_val += input[n][c][iz_tnw][iy_tnw][ix_tnw] * tnw; - } - if (WITHIN_BOUNDS(ix_tne, iy_tne, iz_tne, ID, IH, IW)) { - out_val += input[n][c][iz_tne][iy_tne][ix_tne] * tne; - } - if (WITHIN_BOUNDS(ix_tsw, iy_tsw, iz_tsw, ID, IH, IW)) { - out_val += input[n][c][iz_tsw][iy_tsw][ix_tsw] * tsw; - } - if (WITHIN_BOUNDS(ix_tse, iy_tse, iz_tse, ID, IH, IW)) { - out_val += input[n][c][iz_tse][iy_tse][ix_tse] * tse; - } - if (WITHIN_BOUNDS(ix_bnw, iy_bnw, iz_bnw, ID, IH, IW)) { - out_val += input[n][c][iz_bnw][iy_bnw][ix_bnw] * bnw; - } - if (WITHIN_BOUNDS(ix_bne, iy_bne, iz_bne, ID, IH, IW)) { - out_val += input[n][c][iz_bne][iy_bne][ix_bne] * bne; - } - if (WITHIN_BOUNDS(ix_bsw, iy_bsw, iz_bsw, ID, IH, IW)) { - out_val += input[n][c][iz_bsw][iy_bsw][ix_bsw] * bsw; - } - if (WITHIN_BOUNDS(ix_bse, iy_bse, iz_bse, ID, IH, IW)) { - out_val += input[n][c][iz_bse][iy_bse][ix_bse] * bse; - } - output[n][c][d][h][w] = out_val; - } - } -} - -template -__launch_bounds__(1024) -__global__ void VolumetricGridSamplerBilinear_updateGradInput_kernel( - const int nthreads, - THCDeviceTensor input, THCDeviceTensor gradInput, - THCDeviceTensor grid, THCDeviceTensor gradGrid, - THCDeviceTensor gradOutput, - const int padding_mode) { - - int N = input.getSize(0); - int C = input.getSize(1); - int ID = input.getSize(2); - int IH = input.getSize(3); - int IW = input.getSize(4); - int D = grid.getSize(1); - int H = grid.getSize(2); - int W = grid.getSize(3); - - CUDA_KERNEL_LOOP(index, nthreads) { - - const int n = index % N; - const int d = (index / N) % D; - const int h = (index / (N * D)) % H; - const int w = (index / (N * D * H)) % W; - - // get the corresponding input x, y, z co-ordinates from grid - Dtype ix = grid[n][d][h][w][0]; - Dtype iy = grid[n][d][h][w][1]; - Dtype iz = grid[n][d][h][w][2]; - - Dtype gix = ScalarConvert::to(0); - Dtype giy = ScalarConvert::to(0); - Dtype giz = ScalarConvert::to(0); - - // normalize ix, iy, iz from [-1, 1] to [0, IW-1] & [0, IH-1] & [0, ID-1] - ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); - iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1)); - iz = ScalarConvert::to(((iz + 1.f) / 2) * (ID-1)); - - // get corner pixel values from (x, y, z) - 
// for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int ix_tnw = floor(ScalarConvert::to(ix)); - int iy_tnw = floor(ScalarConvert::to(iy)); - int iz_tnw = floor(ScalarConvert::to(iz)); - - int ix_tne = ix_tnw + 1; - int iy_tne = iy_tnw; - int iz_tne = iz_tnw; - - int ix_tsw = ix_tnw; - int iy_tsw = iy_tnw + 1; - int iz_tsw = iz_tnw; - - int ix_tse = ix_tnw + 1; - int iy_tse = iy_tnw + 1; - int iz_tse = iz_tnw; - - int ix_bnw = ix_tnw; - int iy_bnw = iy_tnw; - int iz_bnw = iz_tnw + 1; - - int ix_bne = ix_tnw + 1; - int iy_bne = iy_tnw; - int iz_bne = iz_tnw + 1; - - int ix_bsw = ix_tnw; - int iy_bsw = iy_tnw + 1; - int iz_bsw = iz_tnw + 1; - - int ix_bse = ix_tnw + 1; - int iy_bse = iy_tnw + 1; - int iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - Dtype tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - Dtype tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - Dtype tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - Dtype tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - Dtype bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - Dtype bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - Dtype bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - Dtype bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - Dtype gradout; - Dtype tnw_val; - Dtype tne_val; - Dtype tsw_val; - Dtype tse_val; - Dtype bnw_val; - Dtype bne_val; - Dtype bsw_val; - Dtype bse_val; - - int ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; - int ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; - int ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; - int ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; - - if (padding_mode==MODE_BORDER){ - // clip coordinates to image borders - CLIP_COORDINATES(ix_tnw, ix_tnw_cl, IW); - CLIP_COORDINATES(iy_tnw, iy_tnw_cl, IH); - CLIP_COORDINATES(iz_tnw, iz_tnw_cl, ID); - CLIP_COORDINATES(ix_tne, ix_tne_cl, IW); - CLIP_COORDINATES(iy_tne, iy_tne_cl, IH); - CLIP_COORDINATES(iz_tne, iz_tne_cl, ID); - CLIP_COORDINATES(ix_tsw, ix_tsw_cl, IW); - CLIP_COORDINATES(iy_tsw, iy_tsw_cl, IH); - CLIP_COORDINATES(iz_tsw, iz_tsw_cl, ID); - CLIP_COORDINATES(ix_tse, ix_tse_cl, IW); - CLIP_COORDINATES(iy_tse, iy_tse_cl, IH); - CLIP_COORDINATES(iz_tse, iz_tse_cl, ID); - CLIP_COORDINATES(ix_bnw, ix_bnw_cl, IW); - CLIP_COORDINATES(iy_bnw, iy_bnw_cl, IH); - CLIP_COORDINATES(iz_bnw, iz_bnw_cl, ID); - CLIP_COORDINATES(ix_bne, ix_bne_cl, IW); - CLIP_COORDINATES(iy_bne, iy_bne_cl, IH); - CLIP_COORDINATES(iz_bne, iz_bne_cl, ID); - CLIP_COORDINATES(ix_bsw, ix_bsw_cl, IW); - CLIP_COORDINATES(iy_bsw, iy_bsw_cl, IH); - CLIP_COORDINATES(iz_bsw, iz_bsw_cl, ID); - CLIP_COORDINATES(ix_bse, ix_bse_cl, IW); - CLIP_COORDINATES(iy_bse, iy_bse_cl, IH); - CLIP_COORDINATES(iz_bse, iz_bse_cl, ID); - } - else { - ix_tnw_cl = ix_tnw; - iy_tnw_cl = iy_tnw; - iz_tnw_cl = iz_tnw; - ix_tne_cl = ix_tne; - iy_tne_cl = iy_tne; - iz_tne_cl = iz_tne; - ix_tsw_cl = ix_tsw; - iy_tsw_cl = iy_tsw; - iz_tsw_cl = iz_tsw; - ix_tse_cl = ix_tse; - iy_tse_cl = iy_tse; - iz_tse_cl = iz_tse; - ix_bnw_cl = ix_bnw; - iy_bnw_cl = iy_bnw; - iz_bnw_cl = iz_bnw; - ix_bne_cl = ix_bne; - iy_bne_cl = iy_bne; - iz_bne_cl = iz_bne; - ix_bsw_cl = ix_bsw; - iy_bsw_cl = iy_bsw; - iz_bsw_cl = iz_bsw; - ix_bse_cl = ix_bse; - iy_bse_cl = iy_bse; - iz_bse_cl = iz_bse; - } - - for (int c = 0; c < C; ++c) { - gradout = gradOutput[n][c][d][h][w]; - - // calculate and set gradInput - SAFE_ADD(gradInput, ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, n, c, ID, IH, 
IW, tnw * gradout); - SAFE_ADD(gradInput, ix_tne_cl, iy_tne_cl, iz_tne_cl, n, c, ID, IH, IW, tne * gradout); - SAFE_ADD(gradInput, ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, n, c, ID, IH, IW, tsw * gradout); - SAFE_ADD(gradInput, ix_tse_cl, iy_tse_cl, iz_tse_cl, n, c, ID, IH, IW, tse * gradout); - SAFE_ADD(gradInput, ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, n, c, ID, IH, IW, bnw * gradout); - SAFE_ADD(gradInput, ix_bne_cl, iy_bne_cl, iz_bne_cl, n, c, ID, IH, IW, bne * gradout); - SAFE_ADD(gradInput, ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, n, c, ID, IH, IW, bsw * gradout); - SAFE_ADD(gradInput, ix_bse_cl, iy_bse_cl, iz_bse_cl, n, c, ID, IH, IW, bse * gradout); - - // calculate gradGrid - tnw_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ID, IH, IW)) { - tnw_val = input[n][c][iz_tnw_cl][iy_tnw_cl][ix_tnw_cl]; - } - tne_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_tne_cl, iy_tne_cl, iz_tne_cl, ID, IH, IW)) { - tne_val = input[n][c][iz_tne_cl][iy_tne_cl][ix_tne_cl]; - } - tsw_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ID, IH, IW)) { - tsw_val = input[n][c][iz_tsw_cl][iy_tsw_cl][ix_tsw_cl]; - } - tse_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_tse_cl, iy_tse_cl, iz_tse_cl, ID, IH, IW)) { - tse_val = input[n][c][iz_tse_cl][iy_tse_cl][ix_tse_cl]; - } - bnw_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ID, IH, IW)) { - bnw_val = input[n][c][iz_bnw_cl][iy_bnw_cl][ix_bnw_cl]; - } - bne_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_bne_cl, iy_bne_cl, iz_bne_cl, ID, IH, IW)) { - bne_val = input[n][c][iz_bne_cl][iy_bne_cl][ix_bne_cl]; - } - bsw_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ID, IH, IW)) { - bsw_val = input[n][c][iz_bsw_cl][iy_bsw_cl][ix_bsw_cl]; - } - bse_val = ScalarConvert::to(0); - if (WITHIN_BOUNDS(ix_bse_cl, iy_bse_cl, iz_bse_cl, ID, IH, IW)) { - bse_val = input[n][c][iz_bse_cl][iy_bse_cl][ix_bse_cl]; - } - - Dtype m1 = ScalarConvert::to(-1); - gix += m1 * tnw_val * (iy_bse - iy) * (iz_bse - iz) * gradout; - gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gradout; - gix += m1 * tsw_val * (iy - iy_bne) * (iz_bne - iz) * gradout; - gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gradout; - gix += m1 * bnw_val * (iy_tse - iy) * (iz - iz_tse) * gradout; - gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gradout; - gix += m1 * bsw_val * (iy - iy_tne) * (iz - iz_tne) * gradout; - gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gradout; - - - giy += m1 * tnw_val * (ix_bse - ix) * (iz_bse - iz) * gradout; - giy += m1 * tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gradout; - giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gradout; - giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gradout; - giy += m1 * bnw_val * (ix_tse - ix) * (iz - iz_tse) * gradout; - giy += m1 * bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gradout; - giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gradout; - giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gradout; - - giz += m1 * tnw_val * (ix_bse - ix) * (iy_bse - iy) * gradout; - giz += m1 * tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gradout; - giz += m1 * tsw_val * (ix_bne - ix) * (iy - iy_bne) * gradout; - giz += m1 * tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gradout; - giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gradout; - giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gradout; - giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gradout; - giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gradout; - } - - // un-normalize 
gradGrid values back to [-1, 1] constraints - gix = gix * (IW - 1) / 2; - giy = giy * (IH - 1) / 2; - giz = giz * (ID - 1) / 2; - - Dtype gix_old = gradGrid[n][d][h][w][0]; - Dtype giy_old = gradGrid[n][d][h][w][1]; - Dtype giz_old = gradGrid[n][d][h][w][2]; - - gradGrid[n][d][h][w][0] = gix_old + gix; - gradGrid[n][d][h][w][1] = giy_old + giy; - gradGrid[n][d][h][w][2] = giz_old + giz; - } -} - -#undef MIN -#undef MAX -#undef CLIP_COORDINATES -#undef WITHIN_BOUNDS -#undef SAFE_ADD - -#include "generic/VolumetricGridSamplerBilinear.cu" -#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h index 47f9bee0fb6744..e2a99640ba69b6 100644 --- a/aten/src/THCUNN/common.h +++ b/aten/src/THCUNN/common.h @@ -62,7 +62,7 @@ inline int GET_BLOCKS(const int N) #define THCUNN_check_dim_size(STATE, T, DIM, DIM_SIZE, SIZE) \ if (THCTensor_(nDimensionLegacyNoScalars)(STATE, T) != DIM || \ - THCTensor_(size)(STATE, T, DIM_SIZE) != SIZE) { \ + THCTensor_(sizeLegacyNoScalars)(STATE, T, DIM_SIZE) != SIZE) { \ THCDescBuff s1 = THCTensor_(sizeDesc)(state, T); \ THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ @@ -70,7 +70,7 @@ inline int GET_BLOCKS(const int N) #define THCUNN_check_dim_size_indices(STATE, T, DIM, DIM_SIZE, SIZE) \ if (THCIndexTensor_(nDimensionLegacyNoScalars)(STATE, T) != DIM || \ - THCIndexTensor_(size)(STATE, T, DIM_SIZE) != SIZE) { \ + THCIndexTensor_(sizeLegacyNoScalars)(STATE, T, DIM_SIZE) != SIZE) { \ THCDescBuff s1 = THCIndexTensor_(sizeDesc)(state, T); \ THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ diff --git a/aten/src/THCUNN/generic/BatchNormalization.cu b/aten/src/THCUNN/generic/BatchNormalization.cu index 03dd38a7bd76ee..81eabc68812f36 100644 --- a/aten/src/THCUNN/generic/BatchNormalization.cu +++ b/aten/src/THCUNN/generic/BatchNormalization.cu @@ -21,11 +21,11 @@ static THCDeviceTensor THNN_(devicetensor)(THCState *state, THCTensor int size[Dim]; for (int i = 0; i < Dim || i < inDim; ++i) { if (i < Dim && i < inDim) { - size[i] = t->size(i); + size[i] = THTensor_sizeLegacyNoScalars(t, i); } else if (i < Dim) { size[i] = 1; } else { - size[Dim - 1] *= t->size(i); + size[Dim - 1] *= THTensor_sizeLegacyNoScalars(t, i); } } return THCDeviceTensor(t->data(), size); diff --git a/aten/src/THCUNN/generic/ClassNLLCriterion.cu b/aten/src/THCUNN/generic/ClassNLLCriterion.cu index 6126dee76dcb27..6866c5798f7d23 100644 --- a/aten/src/THCUNN/generic/ClassNLLCriterion.cu +++ b/aten/src/THCUNN/generic/ClassNLLCriterion.cu @@ -16,7 +16,7 @@ void THNN_(ClassNLLCriterion_updateOutput)( } int n_dims = THCTensor_(nDimensionLegacyNoScalars)(state, input); - int n_classes = THCTensor_(size)(state, input, n_dims - 1); + int n_classes = THCTensor_(sizeLegacyNoScalars)(state, input, n_dims - 1); ignore_index -= TH_INDEX_BASE; if (weights) { @@ -31,8 +31,8 @@ void THNN_(ClassNLLCriterion_updateOutput)( THArgCheck(!input->is_empty() && (n_dims <= 2 && n_dims > 0), 2, "non-empty vector or matrix expected"); - int64_t batch_size = n_dims == 1 ? 1 : THCTensor_(size)(state, input, 0); - int64_t num_targets = THCudaLongTensor_size(state, target, 0); + int64_t batch_size = n_dims == 1 ? 
1 : THCTensor_(sizeLegacyNoScalars)(state, input, 0); + int64_t num_targets = THCudaLongTensor_sizeLegacyNoScalars(state, target, 0); THArgCheck(batch_size == num_targets, 2, "mismatch between the batch size of input (%ld) and that of target (%ld)", batch_size, num_targets); @@ -152,7 +152,7 @@ void THNN_(ClassNLLCriterion_updateGradInput)( THArgCheck(!input->is_empty() && (n_dims <= 2 && n_dims > 0), 2, "non-empty vector or matrix expected"); int64_t batch_size = n_dims == 1 ? 1 : THCTensor_(size)(state, input, 0); - int64_t num_targets = THCudaLongTensor_size(state, target, 0); + int64_t num_targets = THCudaLongTensor_sizeLegacyNoScalars(state, target, 0); THArgCheck(batch_size == num_targets, 2, "mismatch between the batch size of input (%ld) and that of target (%ld)", batch_size, num_targets); diff --git a/aten/src/THCUNN/generic/ELU.cu b/aten/src/THCUNN/generic/ELU.cu index 5c09a0607f0246..6f78349110ec35 100644 --- a/aten/src/THCUNN/generic/ELU.cu +++ b/aten/src/THCUNN/generic/ELU.cu @@ -11,21 +11,23 @@ void THNN_(ELU_updateOutput)( THCTensor *output, accreal alpha, accreal scale, + accreal input_scale, bool inplace) { real negcoef = ScalarConvert::to(alpha * scale); - real poscoef = ScalarConvert::to(scale); + real poscoef = ScalarConvert::to(scale * input_scale); + real negiptcoef = ScalarConvert::to(input_scale); THCUNN_assertSameGPU(state, 2, input, output); if (inplace) { - THC_pointwiseApply1(state, input, ELUupdateOutputIP_functor(negcoef, poscoef)); + THC_pointwiseApply1(state, input, ELUupdateOutputIP_functor(negcoef, poscoef, negiptcoef)); THCTensor_(set)(state, output, input); } else { THCTensor_(resizeAs)(state, output, input); - THC_pointwiseApply2(state, output, input, ELUupdateOutput_functor(negcoef, poscoef)); + THC_pointwiseApply2(state, output, input, ELUupdateOutput_functor(negcoef, poscoef, negiptcoef)); } } @@ -36,15 +38,17 @@ void THNN_(ELU_updateGradInput)( THCTensor *gradInput, THCTensor *output, accreal alpha, - accreal scale) + accreal scale, + accreal input_scale) { real negcoef = ScalarConvert::to(alpha * scale); - real poscoef = ScalarConvert::to(scale); + real poscoef = ScalarConvert::to(scale * input_scale); + real negiptcoef = ScalarConvert::to(input_scale); THCUNN_check_nElement(state, output, gradOutput); THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput); THCTensor_(resizeAs)(state, gradInput, output); - THC_pointwiseApply3(state, gradInput, output, gradOutput, ELUupdateGradInput_functor(negcoef, poscoef)); + THC_pointwiseApply3(state, gradInput, output, gradOutput, ELUupdateGradInput_functor(negcoef, poscoef, negiptcoef)); } #endif diff --git a/aten/src/THCUNN/generic/GatedLinearUnit.cu b/aten/src/THCUNN/generic/GatedLinearUnit.cu index 4622403e76088f..9bd59eec538cb6 100644 --- a/aten/src/THCUNN/generic/GatedLinearUnit.cu +++ b/aten/src/THCUNN/generic/GatedLinearUnit.cu @@ -12,7 +12,7 @@ void THNN_(GatedLinear_updateOutput)( // size output to half of input dim = dim - TH_INDEX_BASE; - const int64_t nIn = THCTensor_(size)(state, input, dim); + const int64_t nIn = THCTensor_(sizeLegacyNoScalars)(state, input, dim); THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. 
Dim %d is size %ld", dim + TH_INDEX_BASE, nIn); const int64_t inputSize = THCTensor_(size)(state, input, dim) / 2; diff --git a/aten/src/THCUNN/generic/MultiMarginCriterion.cu b/aten/src/THCUNN/generic/MultiMarginCriterion.cu index 8272b3d4020ec7..65bd6cdec850bb 100644 --- a/aten/src/THCUNN/generic/MultiMarginCriterion.cu +++ b/aten/src/THCUNN/generic/MultiMarginCriterion.cu @@ -18,7 +18,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( input = THCTensor_(newContiguous)(state, input); if(weights) weights = THCTensor_(newContiguous)(state, weights); - if (input->dim() == 1) + if (THTensor_nDimensionLegacyNoScalars(input) == 1) { dim3 blocks(1); dim3 threads(MULTIMARGIN_THREADS); @@ -30,7 +30,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, input->size(0), + 1, THTensor_sizeLegacyNoScalars(input, 0), reduction == Reduction::ElementwiseMean, margin ); @@ -42,7 +42,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, input->size(0), + 1, THTensor_sizeLegacyNoScalars(input, 0), reduction == Reduction::ElementwiseMean, margin ); @@ -52,7 +52,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( else if (input->dim() == 2) { int nframe = input->size(0); - THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), 3, + THArgCheck(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, "inconsistent target size"); dim3 blocks(input->size(0)); dim3 threads(MULTIMARGIN_THREADS); @@ -149,7 +149,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( if(weights) weights = THCTensor_(newContiguous)(state, weights); - if (input->dim() == 1) + if (THTensor_nDimensionLegacyNoScalars(input) == 1) { dim3 blocks(1); dim3 threads(MULTIMARGIN_THREADS); @@ -162,7 +162,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, gradInput->size(0), + 1, THTensor_sizeLegacyNoScalars(gradInput, 0), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None @@ -176,7 +176,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? 
THCTensor_(data)(state, weights) : NULL, - 1, gradInput->size(0), + 1, THTensor_sizeLegacyNoScalars(gradInput, 0), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None @@ -187,7 +187,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( else if (input->dim() == 2) { int nframe = gradInput->size(0); - THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), 3, + THArgCheck(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, "inconsistent target size"); dim3 blocks(gradInput->size(0)); dim3 threads(MULTIMARGIN_THREADS); diff --git a/aten/src/THCUNN/generic/PReLU.cu b/aten/src/THCUNN/generic/PReLU.cu index 2517b409409aed..2a0d719ff6a3e6 100644 --- a/aten/src/THCUNN/generic/PReLU.cu +++ b/aten/src/THCUNN/generic/PReLU.cu @@ -24,8 +24,8 @@ void THNN_(PReLU_updateOutput)( input = THCTensor_(newContiguous)(state, input); int n = THCTensor_(nElement)(state, input); - if (input->size(ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(ndim > 1)); + if (THTensor_sizeLegacyNoScalars(input, ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, ndim > 1)); int mapSize = 1; for (int d = 2; d < ndim; d++) { @@ -69,8 +69,8 @@ void THNN_(PReLU_updateGradInput)( gradOutput = THCTensor_(newContiguous)(state, gradOutput); int n = THCTensor_(nElement)(state, input); - if (input->size(ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(ndim > 1)); + if (THTensor_sizeLegacyNoScalars(input, ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. 
Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, ndim > 1)); int mapSize = 1; for (int d = 2; d < ndim; d++) { diff --git a/aten/src/THCUNN/generic/SparseLinear.cu b/aten/src/THCUNN/generic/SparseLinear.cu index f73bd5835c04bb..0363dcf0e3996a 100644 --- a/aten/src/THCUNN/generic/SparseLinear.cu +++ b/aten/src/THCUNN/generic/SparseLinear.cu @@ -4,17 +4,17 @@ static bool THNN_(checkInput)(THCTensor* t) { - return !t->is_empty() && THTensor_nDimensionLegacyAll(t) == 2 && t->size(1) == 3; + return !t->is_empty() && t->dim() == 2 && t->size(1) == 3; } static bool THNN_(checkSize2D)(THCTensor* t, int64_t size0, int64_t size1) { - return !t->is_empty() && THTensor_nDimensionLegacyAll(t) == 2 && t->size(0) == size0 && t->size(1) == size1; + return !t->is_empty() && t->dim() == 2 && t->size(0) == size0 && t->size(1) == size1; } static bool THNN_(checkSize1D)(THCTensor* t, int64_t size0) { - return !t->is_empty() && THTensor_nDimensionLegacyAll(t) == 1 && t->size(0) == size0; + return !t->is_empty() && THTensor_nDimensionLegacyNoScalars(t) == 1 && THTensor_sizeLegacyNoScalars(t, 0) == size0; } static inline void THNN_(copyCudaFloatingType)(THCState *state, THCudaIntTensor *buf, THCTensor *t) { diff --git a/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu b/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu index b7010977558816..ae211774a580db 100644 --- a/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu +++ b/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu @@ -8,10 +8,10 @@ void THNN_(SpatialClassNLLCriterion_shapeCheck)( THCIndexTensor *target, THCTensor *weights) { - AT_CHECK(!target->is_empty() && THCIndexTensor_(nDimensionLegacyNoScalars)(state, target) == 3, 1, + AT_CHECK(!target->is_empty() && target->dim() == 3, 1, "only batches of spatial targets supported (non-empty 3D tensors)" \ " but got targets of size: : ", target->sizes()); - AT_CHECK(!input->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, input) == 4, 2, + AT_CHECK(!input->is_empty() && input->dim() == 4, 2, "only batches of spatial inputs supported (non-empty 4D tensors), " \ "but got input of size: ", input->sizes()); if (THCTensor_(size)(state, input, 0) != THCIndexTensor_(size)(state, target, 0) || diff --git a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu index 334afe93cb727e..7860404b685f52 100644 --- a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu +++ b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu @@ -73,7 +73,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( int64_t nOutputPlane = weight->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size(0); + int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); diff --git a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu index 7c6716c41f5bff..546ec2ae3c6185 100644 --- a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu @@ -31,7 +31,7 @@ void THNN_(SpatialDepthwiseConvolution_updateOutput)( // Bias has same # of channels as output if (bias) { - THAssert(bias->size(0) == weight->size(0)); + THAssert(THTensor_sizeLegacyNoScalars(bias, 0) == weight->size(0)); } input = THCTensor_(newContiguous)(state, input); diff 
--git a/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu b/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu index ad0f47418b86cf..4225583735460e 100644 --- a/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu @@ -65,7 +65,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size(0); + int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); diff --git a/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu b/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu index 76777796e361e4..8d039d54068aaf 100644 --- a/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu @@ -65,7 +65,7 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(1); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size(0); + int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); @@ -351,7 +351,7 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( if (gradWeight != NULL) { nOutputPlane = THCTensor_(size)(state, gradWeight, 1); } else if (gradBias != NULL) { - nOutputPlane = THCTensor_(size)(state, gradBias, 0); + nOutputPlane = THCTensor_(sizeLegacyNoScalars)(state, gradBias, 0); } else { return; } diff --git a/aten/src/THCUNN/generic/SpatialGridSamplerBilinear.cu b/aten/src/THCUNN/generic/SpatialGridSamplerBilinear.cu deleted file mode 100644 index 7e285cb55fa7d2..00000000000000 --- a/aten/src/THCUNN/generic/SpatialGridSamplerBilinear.cu +++ /dev/null @@ -1,97 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "generic/SpatialGridSamplerBilinear.cu" -#else - -static inline void THNN_(SpatialGridSamplerBilinear_shapeCheck)( - THCState *state, - THCTensor *input, - THCTensor *grid, - THCTensor *gradOutput) { - THCUNN_argCheck(state, !input->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, input) == 4, 2, input, - "non-empty 4D input tensor expected but got: %s"); - THCUNN_argCheck(state, !grid->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, grid) == 4, 2, grid, - "4D grid tensor expected but got: %s"); - - int64_t nbatch = THCTensor_(size)(state, input, 0); - int64_t channels = THCTensor_(size)(state, input, 1); - int64_t iheight = THCTensor_(size)(state, input, 2); - int64_t iwidth = THCTensor_(size)(state, input, 3); - int64_t oheight = THCTensor_(size)(state, grid, 1); - int64_t owidth = THCTensor_(size)(state, grid, 2); - - THCUNN_check_dim_size(state, grid, 4, 0, nbatch); - THCUNN_check_dim_size(state, grid, 4, 3, 2); - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, 4, 0, nbatch); - THCUNN_check_dim_size(state, gradOutput, 4, 1, channels); - THCUNN_check_dim_size(state, gradOutput, 4, 2, oheight); - THCUNN_check_dim_size(state, gradOutput, 4, 3, owidth); - } -} - -THC_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *grid, - THCTensor *output, - int padding_mode) { - 
- THCUNN_assertSameGPU(state, 3, input, grid, output); - THNN_(SpatialGridSamplerBilinear_shapeCheck)(state, input, grid, NULL); - int64_t N = THCTensor_(size)(state, input, 0); - int64_t C = THCTensor_(size)(state, input, 1); - int64_t IH = THCTensor_(size)(state, input, 2); - int64_t IW = THCTensor_(size)(state, input, 3); - int64_t H = THCTensor_(size)(state,grid, 1); - int64_t W = THCTensor_(size)(state, grid, 2); - - // resize output to the same shape as input - THCTensor_(resize4d)(state, output, N, C, H, W); - - THCDeviceTensor devInput = toDeviceTensor(state, input); - THCDeviceTensor devGrid = toDeviceTensor(state, grid); - THCDeviceTensor devOutput = toDeviceTensor(state, output); - - int count = static_cast(N*H*W); - SpatialGridSamplerBilinear_updateOutput_kernel - <<>>( - count, devInput, devGrid, devOutput, padding_mode); - THCudaCheck(cudaGetLastError()); -} - -THC_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( - THCState *state, - THCTensor *input, THCTensor *gradInput, - THCTensor *grid, THCTensor *gradGrid, - THCTensor *gradOutput, - int padding_mode) { - - THCUNN_assertSameGPU(state, 5, input, gradInput, grid, gradGrid, gradOutput); - THNN_(SpatialGridSamplerBilinear_shapeCheck)(state, input, grid, gradOutput); - int64_t N = THCTensor_(size)(state, input, 0); - int64_t C = THCTensor_(size)(state, input, 1); - int64_t IH = THCTensor_(size)(state, input, 2); - int64_t IW = THCTensor_(size)(state, input, 3); - int64_t H = THCTensor_(size)(state, grid, 1); - int64_t W = THCTensor_(size)(state, grid, 2); - - THCTensor_(resize4d)(state, gradInput, N, C, IH, IW); - THCTensor_(resize4d)(state, gradGrid, N, H, W, 2); - THCTensor_(zero)(state, gradInput); - THCTensor_(zero)(state, gradGrid); - - THCDeviceTensor devInput = toDeviceTensor(state, input); - THCDeviceTensor devGradInput = toDeviceTensor(state, gradInput); - THCDeviceTensor devGrid = toDeviceTensor(state, grid); - THCDeviceTensor devGradGrid = toDeviceTensor(state, gradGrid); - THCDeviceTensor devGradOutput = toDeviceTensor(state, gradOutput); - - int count = static_cast(N*H*W); - SpatialGridSamplerBilinear_updateGradInput_kernel - <<>>( - count, devInput, devGradInput, devGrid, devGradGrid, devGradOutput, padding_mode); - THCudaCheck(cudaGetLastError()); -} - -#endif diff --git a/aten/src/THCUNN/generic/THCUNN.h b/aten/src/THCUNN/generic/THCUNN.h index eaadf66c8306ee..3c4883a1e3c45d 100644 --- a/aten/src/THCUNN/generic/THCUNN.h +++ b/aten/src/THCUNN/generic/THCUNN.h @@ -119,6 +119,7 @@ THC_API void THNN_(ELU_updateOutput)( THCTensor *output, accreal alpha, accreal scale, + accreal input_scale, bool inplace); THC_API void THNN_(ELU_updateGradInput)( @@ -127,7 +128,8 @@ THC_API void THNN_(ELU_updateGradInput)( THCTensor *gradInput, THCTensor *output, accreal alpha, - accreal scale); + accreal scale, + accreal input_scale); THC_API void THNN_(FeatureLPPooling_updateOutput)( THCState* state, @@ -1045,34 +1047,6 @@ THC_API void THNN_(SpatialUpSamplingNearest_updateOutput)( int outputHeight, int outputWidth); -THC_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *grid, - THCTensor *output, - int padding_mode); - -THC_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( - THCState *state, - THCTensor *input, THCTensor *gradInput, - THCTensor *grid, THCTensor *gradGrid, - THCTensor *gradOutput, - int padding_mode); - -THC_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *grid, - THCTensor 
*output, - int padding_mode); - -THC_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( - THCState *state, - THCTensor *input, THCTensor *gradInput, - THCTensor *grid, THCTensor *gradGrid, - THCTensor *gradOutput, - int padding_mode); - THC_API void THNN_(RReLU_updateOutput)( THCState *state, THCTensor *input, diff --git a/aten/src/THCUNN/generic/TemporalReflectionPadding.cu b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu index 870d38ba225f8c..310f22d03e5dfa 100644 --- a/aten/src/THCUNN/generic/TemporalReflectionPadding.cu +++ b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu @@ -79,7 +79,7 @@ void THNN_(TemporalReflectionPadding_updateGradInput)( int planeDim = 0; int dimw = 1; - int numInputDims = THCTensor_(nDimensionLegacyNoScalars)(state, input); + int numInputDims = input->dim(); if (numInputDims == 3) { planeDim++; dimw++; diff --git a/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu b/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu index 52d97fbf2a3638..d6ffba3519553c 100644 --- a/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu @@ -75,7 +75,7 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size(0); + int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimd, outputDepth); diff --git a/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu b/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu index 96310609e956f4..10a5fdc2643193 100644 --- a/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu @@ -387,7 +387,7 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( if (gradWeight) { nOutputPlane = THCTensor_(size)(state, gradWeight, 1); } else if (gradBias) { - nOutputPlane = THCTensor_(size)(state, gradBias, 0); + nOutputPlane = THCTensor_(sizeLegacyNoScalars)(state, gradBias, 0); } else { return; } diff --git a/aten/src/THCUNN/generic/VolumetricGridSamplerBilinear.cu b/aten/src/THCUNN/generic/VolumetricGridSamplerBilinear.cu deleted file mode 100644 index 086667ca476ac1..00000000000000 --- a/aten/src/THCUNN/generic/VolumetricGridSamplerBilinear.cu +++ /dev/null @@ -1,104 +0,0 @@ -#ifndef THC_GENERIC_FILE -#define THC_GENERIC_FILE "generic/VolumetricGridSamplerBilinear.cu" -#else - -static inline void THNN_(VolumetricGridSamplerBilinear_shapeCheck)( - THCState *state, - THCTensor *input, - THCTensor *grid, - THCTensor *gradOutput) { - THCUNN_argCheck(state, !input->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, input) == 5, 2, input, - "non-empty 5D input tensor expected but got: %s"); - THCUNN_argCheck(state, !grid->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, grid) == 5, 2, grid, - "non-empty 5D grid tensor expected but got: %s"); - - int64_t nbatch = THCTensor_(size)(state, input, 0); - int64_t channels = THCTensor_(size)(state, input, 1); - int64_t idepth = THCTensor_(size)(state, input, 2); - int64_t iheight = THCTensor_(size)(state, input, 3); - int64_t iwidth = THCTensor_(size)(state, input, 4); - int64_t odepth = THCTensor_(size)(state, grid, 1); - int64_t oheight = THCTensor_(size)(state, grid, 2); - int64_t owidth = 
THCTensor_(size)(state, grid, 3); - - THCUNN_check_dim_size(state, grid, 5, 0, nbatch); - THCUNN_check_dim_size(state, grid, 5, 4, 3); - - if (gradOutput != NULL) { - THCUNN_check_dim_size(state, gradOutput, 5, 0, nbatch); - THCUNN_check_dim_size(state, gradOutput, 5, 1, channels); - THCUNN_check_dim_size(state, gradOutput, 5, 2, odepth); - THCUNN_check_dim_size(state, gradOutput, 5, 3, oheight); - THCUNN_check_dim_size(state, gradOutput, 5, 4, owidth); - } -} - -THC_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( - THCState *state, - THCTensor *input, - THCTensor *grid, - THCTensor *output, - int padding_mode) { - - THCUNN_assertSameGPU(state, 3, input, grid, output); - THNN_(VolumetricGridSamplerBilinear_shapeCheck)(state, input, grid, NULL); - int64_t N = THCTensor_(size)(state, input, 0); - int64_t C = THCTensor_(size)(state, input, 1); - int64_t ID = THCTensor_(size)(state, input, 2); - int64_t IH = THCTensor_(size)(state, input, 3); - int64_t IW = THCTensor_(size)(state, input, 4); - int64_t D = THCTensor_(size)(state,grid, 1); - int64_t H = THCTensor_(size)(state,grid, 2); - int64_t W = THCTensor_(size)(state, grid, 3); - - // resize output to the same shape as input - THCTensor_(resize5d)(state, output, N, C, D, H, W); - - THCDeviceTensor devInput = toDeviceTensor(state, input); - THCDeviceTensor devGrid = toDeviceTensor(state, grid); - THCDeviceTensor devOutput = toDeviceTensor(state, output); - - int count = static_cast(N*D*H*W); - VolumetricGridSamplerBilinear_updateOutput_kernel - <<>>( - count, devInput, devGrid, devOutput, padding_mode); - THCudaCheck(cudaGetLastError()); -} - -THC_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( - THCState *state, - THCTensor *input, THCTensor *gradInput, - THCTensor *grid, THCTensor *gradGrid, - THCTensor *gradOutput, - int padding_mode) { - - THCUNN_assertSameGPU(state, 5, input, gradInput, grid, gradGrid, gradOutput); - THNN_(VolumetricGridSamplerBilinear_shapeCheck)(state, input, grid, gradOutput); - int64_t N = THCTensor_(size)(state, input, 0); - int64_t C = THCTensor_(size)(state, input, 1); - int64_t ID = THCTensor_(size)(state, input, 2); - int64_t IH = THCTensor_(size)(state, input, 3); - int64_t IW = THCTensor_(size)(state, input, 4); - int64_t D = THCTensor_(size)(state,grid, 1); - int64_t H = THCTensor_(size)(state,grid, 2); - int64_t W = THCTensor_(size)(state, grid, 3); - - THCTensor_(resize5d)(state, gradInput, N, C, ID, IH, IW); - THCTensor_(resize5d)(state, gradGrid, N, D, H, W, 3); - THCTensor_(zero)(state, gradInput); - THCTensor_(zero)(state, gradGrid); - - THCDeviceTensor devInput = toDeviceTensor(state, input); - THCDeviceTensor devGradInput = toDeviceTensor(state, gradInput); - THCDeviceTensor devGrid = toDeviceTensor(state, grid); - THCDeviceTensor devGradGrid = toDeviceTensor(state, gradGrid); - THCDeviceTensor devGradOutput = toDeviceTensor(state, gradOutput); - - int count = static_cast(N*D*H*W); - VolumetricGridSamplerBilinear_updateGradInput_kernel - <<>>( - count, devInput, devGradInput, devGrid, devGradGrid, devGradOutput, padding_mode); - THCudaCheck(cudaGetLastError()); -} - -#endif diff --git a/aten/src/THNN/generic/ClassNLLCriterion.c b/aten/src/THNN/generic/ClassNLLCriterion.c index c7d42b583374cc..7db0531d60d1ef 100644 --- a/aten/src/THNN/generic/ClassNLLCriterion.c +++ b/aten/src/THNN/generic/ClassNLLCriterion.c @@ -82,7 +82,7 @@ void THNN_(ClassNLLCriterion_updateOutput)( } } else if (THTensor_(nDimensionLegacyAll)(input) == 2) { int batch_size = THTensor_(size)(input, 0); - 
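/* Note on the recurring size()/dim() -> *LegacyNoScalars substitutions in this
   patch: the sketch below is a hypothetical stand-in (not the real TH API) that
   illustrates the convention these accessors are assumed to follow, namely that
   a 0-dimensional scalar tensor is reported as a 1-element, 1-d tensor so that
   vector-oriented call sites keep working. */
#include <stdint.h>

typedef struct { int ndim; int64_t sizes[8]; } FakeTensor;  /* illustrative type only */

/* assumed behaviour: never report fewer than one dimension */
static int fake_nDimensionLegacyNoScalars(const FakeTensor *t) {
  return t->ndim == 0 ? 1 : t->ndim;
}

/* assumed behaviour: dimension 0 of a 0-dim tensor acts as size 1 */
static int64_t fake_sizeLegacyNoScalars(const FakeTensor *t, int dim) {
  return t->ndim == 0 ? 1 : t->sizes[dim];
}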
THAssert(THIndexTensor_(size)(target, 0) == batch_size); + THAssert(THTensor_sizeLegacyNoScalars(target, 0) == batch_size); int n_target = THTensor_(size)(input, 1); @@ -189,7 +189,7 @@ void THNN_(ClassNLLCriterion_updateGradInput)( } else if (THTensor_(nDimensionLegacyAll)(input) == 2) { int batch_size = THTensor_(size)(input, 0); - THAssert(THIndexTensor_(size)(target, 0) == batch_size); + THAssert(THTensor_sizeLegacyNoScalars(target, 0) == batch_size); int n_target = THTensor_(size)(input, 1); diff --git a/aten/src/THNN/generic/ELU.c b/aten/src/THNN/generic/ELU.c index f2d87185b813a5..62111ebbf4d7c2 100644 --- a/aten/src/THNN/generic/ELU.c +++ b/aten/src/THNN/generic/ELU.c @@ -8,19 +8,21 @@ void THNN_(ELU_updateOutput)( THTensor *output, accreal alpha_, accreal scale, + accreal input_scale, bool inplace) { real negcoef = TH_CONVERT_ACCREAL_TO_REAL(alpha_ * scale); - real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale); + real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale * input_scale); + real negiptcoef = TH_CONVERT_ACCREAL_TO_REAL(input_scale); if (inplace) { TH_TENSOR_APPLY(real, input, - *input_data = *input_data <= 0 ? (exp(*input_data)-1) * negcoef : *input_data * poscoef; + *input_data = *input_data <= 0 ? (exp(*input_data * negiptcoef)-1) * negcoef : *input_data * poscoef; ); THTensor_(set)(output, input); } else { THTensor_(resizeAs)(output, input); TH_TENSOR_APPLY2(real, input, real, output, - *output_data = *input_data <= 0 ? (exp(*input_data)-1) * negcoef : *input_data * poscoef; + *output_data = *input_data <= 0 ? (exp(*input_data * negiptcoef)-1) * negcoef : *input_data * poscoef; ); } } @@ -31,14 +33,16 @@ void THNN_(ELU_updateGradInput)( THTensor *gradInput, THTensor *output, accreal alpha_, - accreal scale) + accreal scale, + accreal input_scale) { real negcoef = TH_CONVERT_ACCREAL_TO_REAL(alpha_ * scale); - real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale); + real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale * input_scale); + real negiptcoef = TH_CONVERT_ACCREAL_TO_REAL(input_scale); THNN_CHECK_NELEMENT(output, gradOutput); THTensor_(resizeAs)(gradInput, output); TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, - *gradInput_data = *output_data <= 0 ? *gradOutput_data * (*output_data + negcoef) : *gradOutput_data * poscoef; + *gradInput_data = *output_data <= 0 ? *gradOutput_data * negiptcoef * (*output_data + negcoef) : *gradOutput_data * poscoef; ); } diff --git a/aten/src/THNN/generic/GatedLinearUnit.c b/aten/src/THNN/generic/GatedLinearUnit.c index 68cdc37d54214a..0f888744240473 100644 --- a/aten/src/THNN/generic/GatedLinearUnit.c +++ b/aten/src/THNN/generic/GatedLinearUnit.c @@ -10,7 +10,7 @@ void THNN_(GatedLinear_updateOutput)( { // size output to half of input dim = dim - TH_INDEX_BASE; - const int64_t nIn = THTensor_(size)(input, dim); + const int64_t nIn = THTensor_sizeLegacyNoScalars(input, dim); THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. 
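/* The ELU hunks above thread a new `input_scale` argument through both the CPU
   and CUDA kernels. The standalone sketch below restates the scalar formula as
   it appears in the patch; the function names are illustrative only and are not
   part of THNN. */
#include <math.h>

static double elu_forward(double x, double alpha, double scale, double input_scale) {
  /* negative branch: scaled exponential; positive branch: plain scaling */
  return x <= 0 ? (exp(x * input_scale) - 1) * alpha * scale
                : x * scale * input_scale;
}

static double elu_backward(double grad_out, double out,
                           double alpha, double scale, double input_scale) {
  /* as in the patch, the gradient is expressed through the saved output:
     d/dx[(exp(x*input_scale)-1)*alpha*scale] = input_scale * (out + alpha*scale) */
  return out <= 0 ? grad_out * input_scale * (out + alpha * scale)
                  : grad_out * scale * input_scale;
}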
Dim %d is size %ld", dim + TH_INDEX_BASE, nIn); diff --git a/aten/src/THNN/generic/LookupTable.c b/aten/src/THNN/generic/LookupTable.c index 2260b168d8e8d5..fa6648e2a6b80c 100644 --- a/aten/src/THNN/generic/LookupTable.c +++ b/aten/src/THNN/generic/LookupTable.c @@ -40,7 +40,7 @@ void THNN_(LookupTable_accGradParameters)( if (scaleGradByFreq) { - THIntegerTensor_(resize1d)(count, gradWeight->size(0)); + THIntegerTensor_(resize1d)(count, THTensor_sizeLegacyNoScalars(gradWeight, 0)); count_data = THIntegerTensor_(data)(count); } diff --git a/aten/src/THNN/generic/MultiLabelMarginCriterion.c b/aten/src/THNN/generic/MultiLabelMarginCriterion.c index 0699c3ac471c55..a18252b06914d6 100644 --- a/aten/src/THNN/generic/MultiLabelMarginCriterion.c +++ b/aten/src/THNN/generic/MultiLabelMarginCriterion.c @@ -17,14 +17,14 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( int64_t t, d, dt, ddt; real sum; - AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + AT_CHECK(!input->is_empty() && input->dim() <= 2, "non-empty vector or matrix expected, got size: ", input->sizes()); - if (input->dim() == 1) + if (input->dim() <= 1) { nframe = 1; - dim = input->size(0); - AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == dim), + dim = THTensor_sizeLegacyNoScalars(input, 0); + AT_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), "inconsistent target size"); } else @@ -155,16 +155,16 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( int64_t t, d, dt; real g; - AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + AT_CHECK(!input->is_empty() && input->dim() <= 2, "vector or matrix expected, got size: ", input->sizes()); - if (input->dim() == 1) + if (input->dim() <= 1) { nframe = 1; - dim = input->size(0); - AT_CHECK((!target->is_empty() && target->dim() == 1) && (target->size(0) == dim), + dim = THTensor_sizeLegacyNoScalars(input, 0); + AT_CHECK((!target->is_empty() && target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), "inconsistent target size"); - AT_CHECK((!isTarget->is_empty() && isTarget->dim() == 1) && (isTarget->size(0) == dim), + AT_CHECK((!isTarget->is_empty() && isTarget->dim() <= 1) && (THTensor_sizeLegacyNoScalars(isTarget, 0) == dim), "inconsistent isTarget size"); } else diff --git a/aten/src/THNN/generic/MultiMarginCriterion.c b/aten/src/THNN/generic/MultiMarginCriterion.c index 424669e5de8515..2c8f38be23eb3a 100644 --- a/aten/src/THNN/generic/MultiMarginCriterion.c +++ b/aten/src/THNN/generic/MultiMarginCriterion.c @@ -20,19 +20,19 @@ void THNN_(MultiMarginCriterion_updateOutput)( int64_t t, d; real sum; - AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + AT_CHECK(!input->is_empty() && input->dim() <= 2, "non-empty vector or matrix expected, got size: ", input->sizes()); - if (input->dim() == 1) + if (input->dim() <= 1) { nframe = 1; - dim = input->size(0); + dim = THTensor_sizeLegacyNoScalars(input, 0); } else { nframe = input->size(0); dim = input->size(1); - AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), + AT_CHECK(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), "inconsistent target size, got: ", target->sizes()); } @@ -136,19 +136,19 @@ void THNN_(MultiMarginCriterion_updateGradInput)( int64_t t, d; real g; - AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), + 
AT_CHECK(!input->is_empty() && (input->dim() <= 2), "non-empty vector or matrix expected, got size: ", input->sizes()); - if (input->dim() == 1) + if (input->dim() <= 1) { nframe = 1; - dim = input->size(0); + dim = THTensor_sizeLegacyNoScalars(input, 0); } else { nframe = input->size(0); dim = input->size(1); - AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), + AT_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), "inconsistent target size, got: ", target->sizes()); } diff --git a/aten/src/THNN/generic/PReLU.c b/aten/src/THNN/generic/PReLU.c index e148fde783ce9d..1837874852d2bb 100644 --- a/aten/src/THNN/generic/PReLU.c +++ b/aten/src/THNN/generic/PReLU.c @@ -26,8 +26,8 @@ void THNN_(PReLU_updateOutput)( int64_t bs = 1, ks = 1; { int64_t input_ndim = THTensor_(nDimensionLegacyAll)(input); - if (input->size(input_ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(input_ndim > 1)); + if (THTensor_sizeLegacyNoScalars(input, input_ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, input_ndim > 1)); if (input_ndim > 1) { bs = input->size(0); @@ -91,8 +91,8 @@ void THNN_(PReLU_updateGradInput)( int64_t bs = 1, ks = 1; { int64_t input_ndim = THTensor_(nDimensionLegacyAll)(input); - if (input->size(input_ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(input_ndim > 1)); + if (THTensor_sizeLegacyNoScalars(input, input_ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, input_ndim > 1)); if (input_ndim > 1) { bs = input->size(0); @@ -162,8 +162,8 @@ void THNN_(PReLU_accGradParameters)( int64_t bs = 1, ks = 1; { int64_t input_ndim = THTensor_(nDimensionLegacyAll)(input); - if (input->size(input_ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(input_ndim > 1)); + if (THTensor_sizeLegacyNoScalars(input, input_ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. 
Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, input_ndim > 1)); if (input_ndim > 1) { bs = input->size(0); diff --git a/aten/src/THNN/generic/SparseLinear.c b/aten/src/THNN/generic/SparseLinear.c index a28d4e78477ceb..3bf8e652fa9ed9 100644 --- a/aten/src/THNN/generic/SparseLinear.c +++ b/aten/src/THNN/generic/SparseLinear.c @@ -26,7 +26,7 @@ static bool THNN_(checkSize2D)(THTensor* t, int64_t size0, int64_t size1) static bool THNN_(checkSize1D)(THTensor* t, int64_t size0) { - return !t->is_empty() && t->dim() == 1 && t->size(0) == size0; + return !t->is_empty() && THTensor_nDimensionLegacyNoScalars(t) == 1 && THTensor_sizeLegacyNoScalars(t, 0) == size0; } static void THNN_(set1d)(THTensor *t, int64_t x0, real value) { diff --git a/aten/src/THNN/generic/SpatialConvolutionMM.c b/aten/src/THNN/generic/SpatialConvolutionMM.c index fce2c8575935a5..f18a6d0817059b 100644 --- a/aten/src/THNN/generic/SpatialConvolutionMM.c +++ b/aten/src/THNN/generic/SpatialConvolutionMM.c @@ -72,7 +72,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size(0); + int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); @@ -332,7 +332,7 @@ static void THNN_(SpatialConvolutionMM_accGradParameters_frame)( } if (gradBias) { - for(i = 0; i < gradBias->size(0); i++) + for(i = 0; i < THTensor_sizeLegacyNoScalars(gradBias, 0); i++) { int64_t k; real sum = 0; diff --git a/aten/src/THNN/generic/SpatialDilatedConvolution.c b/aten/src/THNN/generic/SpatialDilatedConvolution.c index 63e7bd81033e12..2f71861963fcdf 100644 --- a/aten/src/THNN/generic/SpatialDilatedConvolution.c +++ b/aten/src/THNN/generic/SpatialDilatedConvolution.c @@ -64,7 +64,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size(0); + int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); diff --git a/aten/src/THNN/generic/SpatialFullDilatedConvolution.c b/aten/src/THNN/generic/SpatialFullDilatedConvolution.c index 7226db67ef1a74..eeb644fc9eb5e6 100644 --- a/aten/src/THNN/generic/SpatialFullDilatedConvolution.c +++ b/aten/src/THNN/generic/SpatialFullDilatedConvolution.c @@ -64,7 +64,7 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(1); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size(0); + int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); @@ -332,7 +332,7 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( if (gradWeight) { nOutputPlane = THTensor_(size)(gradWeight, 1); } else if (gradBias) { - nOutputPlane = THTensor_(size)(gradBias, 0); + nOutputPlane = THTensor_sizeLegacyNoScalars(gradBias, 0); } else { return; } @@ -402,7 +402,7 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( // M,N,K are dims of matrix A and B // (see 
http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t n = columns->size(0); // nOutputPlane * kh * kw - int64_t m = input_n->size(0); // nInputPlane + int64_t m = THTensor_sizeLegacyNoScalars(input_n, 0); // nInputPlane int64_t k = columns->size(1); // inputHeight * inputWidth // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) diff --git a/aten/src/THNN/generic/SpatialGridSamplerBilinear.c b/aten/src/THNN/generic/SpatialGridSamplerBilinear.c deleted file mode 100644 index d31f3e0a76c20a..00000000000000 --- a/aten/src/THNN/generic/SpatialGridSamplerBilinear.c +++ /dev/null @@ -1,250 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/SpatialGridSamplerBilinear.c" -#else - -#undef MIN -#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) -#undef MAX -#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) - -#undef MODE_BORDER -#define MODE_BORDER 1 - -static inline void THNN_(SpatialGridSamplerBilinear_shapeCheck) - (THTensor *input, THTensor *grid, THTensor *gradOutput) { - THNN_ARGCHECK(!input->is_empty() && input->dim() == 4, 2, input, - "non-empty 4D input tensor expected but got: %s"); - THNN_ARGCHECK(!grid->is_empty() && grid->dim() == 4, 2, grid, - "non-empty 4D grid tensor expected but got: %s"); - - int nbatch = THTensor_(size)(input, 0); - int channels = THTensor_(size)(input, 1); - int oheight = THTensor_(size)(grid, 1); - int owidth = THTensor_(size)(grid, 2); - - THNN_CHECK_DIM_SIZE(grid, 4, 0, nbatch); - THNN_CHECK_DIM_SIZE(grid, 4, 3, 2); - - if (gradOutput != NULL) { - THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nbatch); - THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, channels); - THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, oheight); - THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, owidth); - } -} - -#define SAFE_GET(input, x, y, n, c, H, W) x >= 0 && x < W && y >=0 \ - && y < H ? 
THTensor_(fastGet4d)(input, n, c, y, x) : 0 - -#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) - -TH_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *grid, - THTensor *output, - int padding_mode) { - - THNN_(SpatialGridSamplerBilinear_shapeCheck)(input, grid, NULL); - int N = THTensor_(size)(input, 0); - int C = THTensor_(size)(input, 1); - int IH = THTensor_(size)(input, 2); - int IW = THTensor_(size)(input, 3); - int H = THTensor_(size)(grid, 1); - int W = THTensor_(size)(grid, 2); - - // resize output to the same shape as input - THTensor_(resize4d)(output, N, C, H, W); - - // loop over each output pixel - int n, h, w, c; -#pragma omp parallel for private(n, h, w, c) - for (n = 0; n < N; ++n) { - for (h = 0; h < H; ++h) { - for (w = 0; w < W; ++w) { - // get the corresponding input x, y co-ordinates from grid - real ix = THTensor_(fastGet4d)(grid, n, h, w, 0); - real iy = THTensor_(fastGet4d)(grid, n, h, w, 1); - - // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] - ix = ((ix + 1) / 2) * (IW-1); - iy = ((iy + 1) / 2) * (IH-1); - - // get NE, NW, SE, SW pixel values from (x, y) - int ix_nw = floor(ix); - int iy_nw = floor(iy); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - real nw = (ix_se - ix) * (iy_se - iy); - real ne = (ix - ix_sw) * (iy_sw - iy); - real sw = (ix_ne - ix) * (iy - iy_ne); - real se = (ix - ix_nw) * (iy - iy_nw); - - if (padding_mode==MODE_BORDER){ - // clip coordinates to image borders - CLIP_COORDINATES(ix_nw, ix_nw, IW); - CLIP_COORDINATES(iy_nw, iy_nw, IH); - CLIP_COORDINATES(ix_ne, ix_ne, IW); - CLIP_COORDINATES(iy_ne, iy_ne, IH); - CLIP_COORDINATES(ix_sw, ix_sw, IW); - CLIP_COORDINATES(iy_sw, iy_sw, IH); - CLIP_COORDINATES(ix_se, ix_se, IW); - CLIP_COORDINATES(iy_se, iy_se, IH); - } - - // calculate bilinear weighted pixel value and set output pixel - for (c = 0; c < C; ++c) { - // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne - // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se - real nw_val = SAFE_GET(input, ix_nw, iy_nw, n, c, IH, IW); - real ne_val = SAFE_GET(input, ix_ne, iy_ne, n, c, IH, IW); - real sw_val = SAFE_GET(input, ix_sw, iy_sw, n, c, IH, IW); - real se_val = SAFE_GET(input, ix_se, iy_se, n, c, IH, IW); - real out_val = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; - THTensor_(fastSet4d)(output, n, c, h, w, out_val); - } - } - } - } -} - -#define SAFE_ADD(input, x, y, n, c, H, W, value) \ - do { \ - if (x >= 0 && x < W && y >=0 && y < H) { \ - real old_value = THTensor_(fastGet4d)(input, n, c, y, x); \ - THTensor_(fastSet4d)(input, n, c, y, x, value + old_value); \ - } \ - } while(0) - -TH_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( - THNNState *state, - THTensor *input, THTensor *gradInput, - THTensor *grid, THTensor *gradGrid, - THTensor *gradOutput, - int padding_mode) { - - THNN_(SpatialGridSamplerBilinear_shapeCheck)(input, grid, gradOutput); - int N = THTensor_(size)(input, 0); - int C = THTensor_(size)(input, 1); - int IH = THTensor_(size)(input, 2); - int IW = THTensor_(size)(input, 3); - int H = THTensor_(size)(grid, 1); - int W = THTensor_(size)(grid, 2); - - THTensor_(resize4d)(gradInput, N, C, IH, IW); - THTensor_(resize4d)(gradGrid, N, H, W, 2); - THTensor_(zero)(gradInput); - THTensor_(zero)(gradGrid); - - // loop over each output pixel - int n, h, w; -#pragma omp parallel 
for private(n, h, w) - for (n = 0; n < N; ++n) { - for (h = 0; h < H; ++h) { - for (w = 0; w < W; ++w) { - // get the corresponding input x, y co-ordinates from grid - real ix = THTensor_(fastGet4d)(grid, n, h, w, 0); - real iy = THTensor_(fastGet4d)(grid, n, h, w, 1); - - real gix = 0; - real giy = 0; - - // normalize ix, iy from [-1, 1] to [0, H-1] & [0, W-1] - ix = ((ix + 1) / 2) * (IW-1); - iy = ((iy + 1) / 2) * (IH-1); - - // get NE, NW, SE, SW pixel values from (x, y) - int ix_nw = floor(ix); - int iy_nw = floor(iy); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - real nw = (ix_se - ix) * (iy_se - iy); - real ne = (ix - ix_sw) * (iy_sw - iy); - real sw = (ix_ne - ix) * (iy - iy_ne); - real se = (ix - ix_nw) * (iy - iy_nw); - - int ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; - - if (padding_mode==MODE_BORDER){ - // get clipped NE, NW, SE, SW pixel values from (x, y) - CLIP_COORDINATES(ix_nw, ix_nw_cl, IW); - CLIP_COORDINATES(iy_nw, iy_nw_cl, IH); - CLIP_COORDINATES(ix_ne, ix_ne_cl, IW); - CLIP_COORDINATES(iy_ne, iy_ne_cl, IH); - CLIP_COORDINATES(ix_sw, ix_sw_cl, IW); - CLIP_COORDINATES(iy_sw, iy_sw_cl, IH); - CLIP_COORDINATES(ix_se, ix_se_cl, IW); - CLIP_COORDINATES(iy_se, iy_se_cl, IH); - } - else { - ix_nw_cl = ix_nw; - iy_nw_cl = iy_nw; - ix_ne_cl = ix_ne; - iy_ne_cl = iy_ne; - ix_sw_cl = ix_sw; - iy_sw_cl = iy_sw; - ix_se_cl = ix_se; - iy_se_cl = iy_se; - } - - for (int c = 0; c < C; ++c) { - real gradout = THTensor_(fastGet4d)(gradOutput, n, c, h, w); - - // calculate and set gradInput - SAFE_ADD(gradInput, ix_nw_cl, iy_nw_cl, n, c, IH, IW, nw * gradout); - SAFE_ADD(gradInput, ix_ne_cl, iy_ne_cl, n, c, IH, IW, ne * gradout); - SAFE_ADD(gradInput, ix_sw_cl, iy_sw_cl, n, c, IH, IW, sw * gradout); - SAFE_ADD(gradInput, ix_se_cl, iy_se_cl, n, c, IH, IW, se * gradout); - - // calculate gradGrid - real nw_val = SAFE_GET(input, ix_nw_cl, iy_nw_cl, n, c, IH, IW); - real ne_val = SAFE_GET(input, ix_ne_cl, iy_ne_cl, n, c, IH, IW); - real sw_val = SAFE_GET(input, ix_sw_cl, iy_sw_cl, n, c, IH, IW); - real se_val = SAFE_GET(input, ix_se_cl, iy_se_cl, n, c, IH, IW); - - gix -= nw_val * (iy_se - iy) * gradout; - gix += ne_val * (iy_sw - iy) * gradout; - gix -= sw_val * (iy - iy_ne) * gradout; - gix += se_val * (iy - iy_nw) * gradout; - - giy -= nw_val * (ix_se - ix) * gradout; - giy -= ne_val * (ix - ix_sw) * gradout; - giy += sw_val * (ix_ne - ix) * gradout; - giy += se_val * (ix - ix_nw) * gradout; - } - - // un-normalize gradGrid values back to [-1, 1] constraints - gix = gix * (IW - 1) / 2; - giy = giy * (IH - 1) / 2; - - real gix_old = THTensor_(fastGet4d)(gradGrid, n, h, w, 0); - real giy_old = THTensor_(fastGet4d)(gradGrid, n, h, w, 1); - - THTensor_(fastSet4d)(gradGrid, n, h, w, 0, gix_old + gix); - THTensor_(fastSet4d)(gradGrid, n, h, w, 1, giy_old + giy); - } - } - } -} - - -#undef MIN -#undef MAX -#undef SAFE_GET -#undef CLIP_COORDINATES -#undef SAFE_ADD -#undef MODE_BORDER - -#endif diff --git a/aten/src/THNN/generic/THNN.h b/aten/src/THNN/generic/THNN.h index 455da04c7e4454..1d7a9176553756 100644 --- a/aten/src/THNN/generic/THNN.h +++ b/aten/src/THNN/generic/THNN.h @@ -90,7 +90,8 @@ TH_API void THNN_(ELU_updateOutput)( THTensor *input, // input tensor THTensor *output, // [OUT] ELU output accreal alpha, // an ELU parameter (as in paper) - accreal scale, // scaling factor + accreal scale, // scaling factor for 
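/* Minimal sketch of the bilinear weighting used by the (now removed)
   SpatialGridSamplerBilinear CPU path above; names here are illustrative, not
   part of THNN. A grid coordinate gx in [-1, 1] maps to
   ix = ((gx + 1) / 2) * (IW - 1), so a gradient with respect to ix is rescaled
   by d(ix)/d(gx) = (IW - 1) / 2, which is the `gix * (IW - 1) / 2` step in the
   deleted backward pass. */
#include <math.h>

static double bilinear_weighted_sample(const double *img, int IW,
                                       double ix, double iy) {
  int ix_nw = (int)floor(ix), iy_nw = (int)floor(iy);
  int ix_se = ix_nw + 1,      iy_se = iy_nw + 1;
  /* each corner is weighted by the area of the rectangle opposite to it */
  double nw = (ix_se - ix) * (iy_se - iy);
  double ne = (ix - ix_nw) * (iy_se - iy);
  double sw = (ix_se - ix) * (iy - iy_nw);
  double se = (ix - ix_nw) * (iy - iy_nw);
  /* no border handling here; the real code clips (MODE_BORDER) or zero-pads
     out-of-range corners via SAFE_GET */
  return img[iy_nw * IW + ix_nw] * nw + img[iy_nw * IW + ix_se] * ne +
         img[iy_se * IW + ix_nw] * sw + img[iy_se * IW + ix_se] * se;
}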
output + accreal input_scale, // scaling factor for input bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated) TH_API void THNN_(ELU_updateGradInput)( THNNState *state, // library's state @@ -98,7 +99,8 @@ TH_API void THNN_(ELU_updateGradInput)( THTensor *gradInput, // [OUT] gradient w.r.t. input THTensor *output, // output from a forward pass accreal alpha, // an ELU parameter (as in paper) - accreal scale); + accreal scale, + accreal input_scale); TH_API void THNN_(DistKLDivCriterion_updateOutput)( THNNState *state, // library's state @@ -1227,34 +1229,6 @@ TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)( int osizeW, bool align_corners); -TH_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *grid, - THTensor *output, - int padding_mode); - -TH_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( - THNNState *state, - THTensor *input, THTensor *gradInput, - THTensor *grid, THTensor *gradGrid, - THTensor *gradOutput, - int padding_mode); - -TH_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *grid, - THTensor *output, - int padding_mode); - -TH_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( - THNNState *state, - THTensor *input, THTensor *gradInput, - THTensor *grid, THTensor *gradGrid, - THTensor *gradOutput, - int padding_mode); - TH_API void THNN_(unfolded_acc)( THTensor *finput, THTensor *input, diff --git a/aten/src/THNN/generic/TemporalRowConvolution.c b/aten/src/THNN/generic/TemporalRowConvolution.c index b623e5a2ad7fd4..e7b51ec194c402 100644 --- a/aten/src/THNN/generic/TemporalRowConvolution.c +++ b/aten/src/THNN/generic/TemporalRowConvolution.c @@ -38,7 +38,7 @@ static inline void THNN_(TemporalRowConvolution_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (ndim == 2 || ndim == 3), 1, input, "non-empty 2D or 3D (batch mode) input tensor expected, but got :%s"); - int64_t inputFrameSize = weight->size(0); + int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0); int64_t nInputFrame = input->size(dimS); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; @@ -197,7 +197,7 @@ void THNN_(TemporalRowConvolution_updateOutput)( THNN_(TemporalRowConvolution_shapeCheck)( state, input, NULL, weight, bias, kW, dW, padW); - int64_t inputFrameSize = weight->size(0); + int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0); int64_t nInputFrame = input->size(ndim - 1); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; @@ -311,7 +311,7 @@ void THNN_(TemporalRowConvolution_updateGradInput)( THNN_(TemporalRowConvolution_shapeCheck)(state, input, gradOutput, weight, NULL, kW, dW, padW); - int64_t inputFrameSize = weight->size(0); + int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0); int64_t nInputFrame = input->size(ndim - 1); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; @@ -386,7 +386,7 @@ static void THNN_(TemporalRowConvolution_accGradParameters_frame)( THTensor_(free)(tfinput); if (gradBias != NULL) { - for (i = 0; i < gradBias->size(0); i++) { + for (i = 0; i < THTensor_sizeLegacyNoScalars(gradBias, 0); i++) { int64_t k; real sum = 0; real *data = THStorage_(data)(THTensor_getStoragePtr(gradOutput3d)) diff --git a/aten/src/THNN/generic/VolumetricConvolution.c b/aten/src/THNN/generic/VolumetricConvolution.c index 4b74445e047705..c979edf71f8f4c 100644 --- a/aten/src/THNN/generic/VolumetricConvolution.c +++ 
b/aten/src/THNN/generic/VolumetricConvolution.c @@ -51,7 +51,7 @@ void THNN_(VolumetricConvolution_updateOutput)( /* add bias */ if (bias) { - for (i = 0; i < bias->size(0); i++) + for (i = 0; i < THTensor_sizeLegacyNoScalars(bias, 0); i++) { THTensor_(select)(outn, output, 0, i); THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); @@ -78,7 +78,7 @@ void THNN_(VolumetricConvolution_updateOutput)( /* add bias */ if (bias) { - for (i = 0; i < bias->size(0); i++) + for (i = 0; i < THTensor_sizeLegacyNoScalars(bias, 0); i++) { THTensor_(select)(outn, outb, 0, i); THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); @@ -117,7 +117,7 @@ void THNN_(VolumetricConvolution_updateGradInput)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for weight, but got: %s"); - int nOutputPlane = (int)weight->size(0); + int nOutputPlane = (int)THTensor_sizeLegacyNoScalars(weight, 0); THNN_ARGCHECK(!gradOutput->is_empty() && (gradOutput->dim() == 4 || gradOutput->dim() == 5), 3, gradOutput, @@ -187,9 +187,9 @@ void THNN_(VolumetricConvolution_accGradParameters)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for gradWeight, but got: %s"); - int nOutputPlane = (int)gradWeight->size(0); + int nOutputPlane = (int)THTensor_sizeLegacyNoScalars(gradWeight, 0); if (gradBias) { - THArgCheck(!gradBias->is_empty() && gradBias->dim() == 1 && gradBias->size(0) == nOutputPlane, 5, + THArgCheck(!gradBias->is_empty() && THTensor_nDimensionLegacyNoScalars(gradBias) == 1 && THTensor_sizeLegacyNoScalars(gradBias, 0) == nOutputPlane, 5, "gradBias tensor has wrong size" ); } diff --git a/aten/src/THNN/generic/VolumetricConvolutionMM.c b/aten/src/THNN/generic/VolumetricConvolutionMM.c index 14d98a79dd29b8..209d1575dacbec 100644 --- a/aten/src/THNN/generic/VolumetricConvolutionMM.c +++ b/aten/src/THNN/generic/VolumetricConvolutionMM.c @@ -102,7 +102,7 @@ static void inline THNN_(VolumetricConvolutionMM_shapeCheck)( int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size(0); + int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, outputDepth); @@ -691,7 +691,7 @@ static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)( } if (gradBias) { - for (i = 0; i < gradBias->size(0); i++) + for (i = 0; i < THTensor_sizeLegacyNoScalars(gradBias, 0); i++) { int64_t k; real sum = 0; diff --git a/aten/src/THNN/generic/VolumetricDilatedConvolution.c b/aten/src/THNN/generic/VolumetricDilatedConvolution.c index 8222c534612fd5..c9fa19f0adf488 100644 --- a/aten/src/THNN/generic/VolumetricDilatedConvolution.c +++ b/aten/src/THNN/generic/VolumetricDilatedConvolution.c @@ -69,7 +69,7 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size(0); + int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth); diff --git a/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c b/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c index 4cc4dcc69837d8..16dedeffb9c58f 100644 --- a/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c +++ 
b/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c @@ -154,7 +154,7 @@ static inline void THNN_(VolumetricFullDilatedConvolution_shapeCheck)( const int64_t nOutputPlane = weight->size(1); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - const int64_t nOutputPlane = bias->size(0); + const int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth); @@ -441,7 +441,7 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( if (gradWeight) { nOutputPlane = THTensor_(size)(gradWeight, 1); } else if (gradBias) { - nOutputPlane = THTensor_(size)(gradBias, 0); + nOutputPlane = THTensor_sizeLegacyNoScalars(gradBias, 0); } else { return; } diff --git a/aten/src/THNN/generic/VolumetricGridSamplerBilinear.c b/aten/src/THNN/generic/VolumetricGridSamplerBilinear.c deleted file mode 100644 index 4d7ace422d4e97..00000000000000 --- a/aten/src/THNN/generic/VolumetricGridSamplerBilinear.c +++ /dev/null @@ -1,409 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/VolumetricGridSamplerBilinear.c" -#else - -#undef MIN -#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) -#undef MAX -#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) - -#undef MODE_BORDER -#define MODE_BORDER 1 - -static inline void THNN_(VolumetricGridSamplerBilinear_shapeCheck) - (THTensor *input, THTensor *grid, THTensor *gradOutput) { - THNN_ARGCHECK(!input->is_empty() && input->dim() == 5, 2, input, - "non-empty 5D input tensor expected but got: %s"); - THNN_ARGCHECK(!grid->is_empty() && grid->dim() == 5, 2, grid, - "non-empty 5D grid tensor expected but got: %s"); - - int nbatch = THTensor_(size)(input, 0); - int channels = THTensor_(size)(input, 1); - int odepth = THTensor_(size)(grid, 1); - int oheight = THTensor_(size)(grid, 2); - int owidth = THTensor_(size)(grid, 3); - - THNN_CHECK_DIM_SIZE(grid, 5, 0, nbatch); - THNN_CHECK_DIM_SIZE(grid, 5, 4, 3); - - if (gradOutput != NULL) { - THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nbatch); - THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, channels); - THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, odepth); - THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, oheight); - THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, owidth); - } -} - -#define SAFE_GET(input, x, y, z, n, c, D, H, W) \ - x >= 0 && x < W && y >=0 && y < H && z >= 0 && z < D \ - ? 
THTensor_(fastGet5d)(input, n, c, z, y, x) : 0 - -#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) - -TH_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( - THNNState *state, - THTensor *input, - THTensor *grid, - THTensor *output, - int padding_mode) { - - THNN_(VolumetricGridSamplerBilinear_shapeCheck)(input, grid, NULL); - int N = THTensor_(size)(input, 0); - int C = THTensor_(size)(input, 1); - int ID = THTensor_(size)(input, 2); - int IH = THTensor_(size)(input, 3); - int IW = THTensor_(size)(input, 4); - int D = THTensor_(size)(grid, 1); - int H = THTensor_(size)(grid, 2); - int W = THTensor_(size)(grid, 3); - - // resize output to the same shape as input - THTensor_(resize5d)(output, N, C, D, H, W); - - // loop over each output pixel - int n, d, h, w, c; -#pragma omp parallel for private(n, d, h, w, c) - for (n = 0; n < N; ++n) { - for (d = 0; d < D; ++d) { - for (h = 0; h < H; ++h) { - for (w = 0; w < W; ++w) { - // get the corresponding input x, y, z co-ordinates from grid - real ix = THTensor_(fastGet5d)(grid, n, d, h, w, 0); - real iy = THTensor_(fastGet5d)(grid, n, d, h, w, 1); - real iz = THTensor_(fastGet5d)(grid, n, d, h, w, 2); - - // normalize ix, iy, iz from [-1, 1] to [0, IW-1] & [0, IH-1] & [0, ID-1] - ix = ((ix + 1) / 2) * (IW-1); - iy = ((iy + 1) / 2) * (IH-1); - iz = ((iz + 1) / 2) * (ID-1); - - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int ix_tnw = floor(ix); - int iy_tnw = floor(iy); - int iz_tnw = floor(iz); - - int ix_tne = ix_tnw + 1; - int iy_tne = iy_tnw; - int iz_tne = iz_tnw; - - int ix_tsw = ix_tnw; - int iy_tsw = iy_tnw + 1; - int iz_tsw = iz_tnw; - - int ix_tse = ix_tnw + 1; - int iy_tse = iy_tnw + 1; - int iz_tse = iz_tnw; - - int ix_bnw = ix_tnw; - int iy_bnw = iy_tnw; - int iz_bnw = iz_tnw + 1; - - int ix_bne = ix_tnw + 1; - int iy_bne = iy_tnw; - int iz_bne = iz_tnw + 1; - - int ix_bsw = ix_tnw; - int iy_bsw = iy_tnw + 1; - int iz_bsw = iz_tnw + 1; - - int ix_bse = ix_tnw + 1; - int iy_bse = iy_tnw + 1; - int iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - real tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - real tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - real tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - real tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - real bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - real bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - real bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - real bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - if (padding_mode==MODE_BORDER){ - // clip coordinates to image borders - CLIP_COORDINATES(ix_tnw, ix_tnw, IW); - CLIP_COORDINATES(iy_tnw, iy_tnw, IH); - CLIP_COORDINATES(iz_tnw, iz_tnw, ID); - CLIP_COORDINATES(ix_tne, ix_tne, IW); - CLIP_COORDINATES(iy_tne, iy_tne, IH); - CLIP_COORDINATES(iz_tne, iz_tne, ID); - CLIP_COORDINATES(ix_tsw, ix_tsw, IW); - CLIP_COORDINATES(iy_tsw, iy_tsw, IH); - CLIP_COORDINATES(iz_tsw, iz_tsw, ID); - CLIP_COORDINATES(ix_tse, ix_tse, IW); - CLIP_COORDINATES(iy_tse, iy_tse, IH); - CLIP_COORDINATES(iz_tse, iz_tse, ID); - CLIP_COORDINATES(ix_bnw, ix_bnw, IW); - CLIP_COORDINATES(iy_bnw, iy_bnw, IH); - CLIP_COORDINATES(iz_bnw, iz_bnw, ID); - CLIP_COORDINATES(ix_bne, ix_bne, IW); - CLIP_COORDINATES(iy_bne, iy_bne, IH); - CLIP_COORDINATES(iz_bne, iz_bne, ID); - CLIP_COORDINATES(ix_bsw, ix_bsw, IW); - CLIP_COORDINATES(iy_bsw, iy_bsw, IH); - CLIP_COORDINATES(iz_bsw, iz_bsw, 
ID); - CLIP_COORDINATES(ix_bse, ix_bse, IW); - CLIP_COORDINATES(iy_bse, iy_bse, IH); - CLIP_COORDINATES(iz_bse, iz_bse, ID); - } - - // calculate bilinear weighted pixel value and set output pixel - for (c = 0; c < C; ++c) { - // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne - // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se - real tnw_val = SAFE_GET(input, ix_tnw, iy_tnw, iz_tnw, n, c, ID, IH, IW); - real tne_val = SAFE_GET(input, ix_tne, iy_tne, iz_tne, n, c, ID, IH, IW); - real tsw_val = SAFE_GET(input, ix_tsw, iy_tsw, iz_tsw, n, c, ID, IH, IW); - real tse_val = SAFE_GET(input, ix_tse, iy_tse, iz_tse, n, c, ID, IH, IW); - real bnw_val = SAFE_GET(input, ix_bnw, iy_bnw, iz_bnw, n, c, ID, IH, IW); - real bne_val = SAFE_GET(input, ix_bne, iy_bne, iz_bne, n, c, ID, IH, IW); - real bsw_val = SAFE_GET(input, ix_bsw, iy_bsw, iz_bsw, n, c, ID, IH, IW); - real bse_val = SAFE_GET(input, ix_bse, iy_bse, iz_bse, n, c, ID, IH, IW); - real out_val = tnw_val * tnw + tne_val * tne + tsw_val * tsw + tse_val * tse + - bnw_val * bnw + bne_val * bne + bsw_val * bsw + bse_val * bse; - THTensor_(fastSet5d)(output, n, c, d, h, w, out_val); - } - } - } - } - } -} - -#define SAFE_ADD(input, x, y, z, n, c, D, H, W, value) \ - do { \ - if (x >= 0 && x < W && y >=0 && y < H && z >=0 && z < D) { \ - real old_value = THTensor_(fastGet5d)(input, n, c, z, y, x); \ - THTensor_(fastSet5d)(input, n, c, z, y, x, value + old_value); \ - } \ - } while(0) - -TH_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( - THNNState *state, - THTensor *input, THTensor *gradInput, - THTensor *grid, THTensor *gradGrid, - THTensor *gradOutput, - int padding_mode) { - - THNN_(VolumetricGridSamplerBilinear_shapeCheck)(input, grid, gradOutput); - int N = THTensor_(size)(input, 0); - int C = THTensor_(size)(input, 1); - int ID = THTensor_(size)(input, 2); - int IH = THTensor_(size)(input, 3); - int IW = THTensor_(size)(input, 4); - int D = THTensor_(size)(grid, 1); - int H = THTensor_(size)(grid, 2); - int W = THTensor_(size)(grid, 3); - - THTensor_(resize5d)(gradInput, N, C, ID, IH, IW); - THTensor_(resize5d)(gradGrid, N, D, H, W, 3); - THTensor_(zero)(gradInput); - THTensor_(zero)(gradGrid); - - // loop over each output pixel - int n, d, h, w; -//#pragma omp parallel for private(n, d, h, w) - for (n = 0; n < N; ++n) { - for (d = 0; d < D; ++d) { - for (h = 0; h < H; ++h) { - for (w = 0; w < W; ++w) { - // get the corresponding input x, y, z co-ordinates from grid - real ix = THTensor_(fastGet5d)(grid, n, d, h, w, 0); - real iy = THTensor_(fastGet5d)(grid, n, d, h, w, 1); - real iz = THTensor_(fastGet5d)(grid, n, d, h, w, 2); - - real gix = 0; - real giy = 0; - real giz = 0; - - // normalize ix, iy, iz from [-1, 1] to [0, W-1] & [0, H-1] & [0, D-1] - ix = ((ix + 1) / 2) * (IW-1); - iy = ((iy + 1) / 2) * (IH-1); - iz = ((iz + 1) / 2) * (ID-1); - - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int ix_tnw = floor(ix); - int iy_tnw = floor(iy); - int iz_tnw = floor(iz); - - int ix_tne = ix_tnw + 1; - int iy_tne = iy_tnw; - int iz_tne = iz_tnw; - - int ix_tsw = ix_tnw; - int iy_tsw = iy_tnw + 1; - int iz_tsw = iz_tnw; - - int ix_tse = ix_tnw + 1; - int iy_tse = iy_tnw + 1; - int iz_tse = iz_tnw; - - int ix_bnw = ix_tnw; - int iy_bnw = iy_tnw; - int iz_bnw = iz_tnw + 1; - - int ix_bne = ix_tnw + 1; - int iy_bne = iy_tnw; - int iz_bne = iz_tnw + 1; - - int ix_bsw = ix_tnw; - int iy_bsw = iy_tnw + 1; - int iz_bsw = iz_tnw + 1; - - int ix_bse = ix_tnw + 1; - int 
iy_bse = iy_tnw + 1; - int iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - real tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - real tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - real tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - real tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - real bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - real bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - real bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - real bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - int ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; - int ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; - int ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; - int ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; - - if (padding_mode==MODE_BORDER){ - // clip coordinates to image borders - CLIP_COORDINATES(ix_tnw, ix_tnw_cl, IW); - CLIP_COORDINATES(iy_tnw, iy_tnw_cl, IH); - CLIP_COORDINATES(iz_tnw, iz_tnw_cl, ID); - CLIP_COORDINATES(ix_tne, ix_tne_cl, IW); - CLIP_COORDINATES(iy_tne, iy_tne_cl, IH); - CLIP_COORDINATES(iz_tne, iz_tne_cl, ID); - CLIP_COORDINATES(ix_tsw, ix_tsw_cl, IW); - CLIP_COORDINATES(iy_tsw, iy_tsw_cl, IH); - CLIP_COORDINATES(iz_tsw, iz_tsw_cl, ID); - CLIP_COORDINATES(ix_tse, ix_tse_cl, IW); - CLIP_COORDINATES(iy_tse, iy_tse_cl, IH); - CLIP_COORDINATES(iz_tse, iz_tse_cl, ID); - CLIP_COORDINATES(ix_bnw, ix_bnw_cl, IW); - CLIP_COORDINATES(iy_bnw, iy_bnw_cl, IH); - CLIP_COORDINATES(iz_bnw, iz_bnw_cl, ID); - CLIP_COORDINATES(ix_bne, ix_bne_cl, IW); - CLIP_COORDINATES(iy_bne, iy_bne_cl, IH); - CLIP_COORDINATES(iz_bne, iz_bne_cl, ID); - CLIP_COORDINATES(ix_bsw, ix_bsw_cl, IW); - CLIP_COORDINATES(iy_bsw, iy_bsw_cl, IH); - CLIP_COORDINATES(iz_bsw, iz_bsw_cl, ID); - CLIP_COORDINATES(ix_bse, ix_bse_cl, IW); - CLIP_COORDINATES(iy_bse, iy_bse_cl, IH); - CLIP_COORDINATES(iz_bse, iz_bse_cl, ID); - } - else { - ix_tnw_cl = ix_tnw; - iy_tnw_cl = iy_tnw; - iz_tnw_cl = iz_tnw; - ix_tne_cl = ix_tne; - iy_tne_cl = iy_tne; - iz_tne_cl = iz_tne; - ix_tsw_cl = ix_tsw; - iy_tsw_cl = iy_tsw; - iz_tsw_cl = iz_tsw; - ix_tse_cl = ix_tse; - iy_tse_cl = iy_tse; - iz_tse_cl = iz_tse; - ix_bnw_cl = ix_bnw; - iy_bnw_cl = iy_bnw; - iz_bnw_cl = iz_bnw; - ix_bne_cl = ix_bne; - iy_bne_cl = iy_bne; - iz_bne_cl = iz_bne; - ix_bsw_cl = ix_bsw; - iy_bsw_cl = iy_bsw; - iz_bsw_cl = iz_bsw; - ix_bse_cl = ix_bse; - iy_bse_cl = iy_bse; - iz_bse_cl = iz_bse; - } - - for (int c = 0; c < C; ++c) { - real gradout = THTensor_(fastGet5d)(gradOutput, n, c, d, h, w); - - // calculate and set gradInput - SAFE_ADD(gradInput, ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, n, c, ID, IH, IW, tnw * gradout); - SAFE_ADD(gradInput, ix_tne_cl, iy_tne_cl, iz_tne_cl, n, c, ID, IH, IW, tne * gradout); - SAFE_ADD(gradInput, ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, n, c, ID, IH, IW, tsw * gradout); - SAFE_ADD(gradInput, ix_tse_cl, iy_tse_cl, iz_tse_cl, n, c, ID, IH, IW, tse * gradout); - SAFE_ADD(gradInput, ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, n, c, ID, IH, IW, bnw * gradout); - SAFE_ADD(gradInput, ix_bne_cl, iy_bne_cl, iz_bne_cl, n, c, ID, IH, IW, bne * gradout); - SAFE_ADD(gradInput, ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, n, c, ID, IH, IW, bsw * gradout); - SAFE_ADD(gradInput, ix_bse_cl, iy_bse_cl, iz_bse_cl, n, c, ID, IH, IW, bse * gradout); - - // calculate gradGrid - real tnw_val = SAFE_GET(input, ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, n, c, ID, IH, IW); - real tne_val = SAFE_GET(input, ix_tne_cl, iy_tne_cl, iz_tne_cl, n, c, ID, IH, IW); - 
real tsw_val = SAFE_GET(input, ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, n, c, ID, IH, IW); - real tse_val = SAFE_GET(input, ix_tse_cl, iy_tse_cl, iz_tse_cl, n, c, ID, IH, IW); - real bnw_val = SAFE_GET(input, ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, n, c, ID, IH, IW); - real bne_val = SAFE_GET(input, ix_bne_cl, iy_bne_cl, iz_bne_cl, n, c, ID, IH, IW); - real bsw_val = SAFE_GET(input, ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, n, c, ID, IH, IW); - real bse_val = SAFE_GET(input, ix_bse_cl, iy_bse_cl, iz_bse_cl, n, c, ID, IH, IW); - - gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gradout; - gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gradout; - gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gradout; - gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gradout; - gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gradout; - gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gradout; - gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gradout; - gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gradout; - - - giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gradout; - giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gradout; - giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gradout; - giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gradout; - giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gradout; - giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gradout; - giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gradout; - giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gradout; - - giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gradout; - giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gradout; - giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gradout; - giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gradout; - giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gradout; - giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gradout; - giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gradout; - giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gradout; - - } - - // un-normalize gradGrid values back to [-1, 1] constraints - gix = gix * (IW - 1) / 2; - giy = giy * (IH - 1) / 2; - giz = giz * (ID - 1) / 2; - - real gix_old = THTensor_(fastGet5d)(gradGrid, n, d, h, w, 0); - real giy_old = THTensor_(fastGet5d)(gradGrid, n, d, h, w, 1); - real giz_old = THTensor_(fastGet5d)(gradGrid, n, d, h, w, 2); - - THTensor_(fastSet5d)(gradGrid, n, d, h, w, 0, gix_old + gix); - THTensor_(fastSet5d)(gradGrid, n, d, h, w, 1, giy_old + giy); - THTensor_(fastSet5d)(gradGrid, n, d, h, w, 2, giz_old + giz); - } - } - } - } -} - -#undef MIN -#undef MAX -#undef SAFE_GET -#undef CLIP_COORDINATES -#undef SAFE_ADD -#undef MODE_BORDER - -#endif diff --git a/aten/src/THNN/init.cpp b/aten/src/THNN/init.cpp index 6c79f5be295b60..c77cd76d54ec87 100644 --- a/aten/src/THNN/init.cpp +++ b/aten/src/THNN/init.cpp @@ -45,7 +45,7 @@ #define THNN_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \ if (THTensor_(nDimensionLegacyNoScalars)(T) != DIM || \ - THTensor_(size)(T, DIM_SIZE) != SIZE) { \ + THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \ THDescBuff s1 = THTensor_(sizeDesc)(T); \ THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ @@ -53,7 +53,7 @@ #define THNN_CHECK_DIM_SIZE_INDICES(T, DIM, DIM_SIZE, SIZE) \ if (THIndexTensor_(nDimensionLegacyNoScalars)(T) != DIM || \ - THIndexTensor_(size)(T, DIM_SIZE) != SIZE) { \ + THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \ THDescBuff s1 = THIndexTensor_(sizeDesc)(T); \ THError("Need " #T " of dimension %d and " #T ".size[%d] 
== %d" \ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ @@ -245,9 +245,6 @@ #include "generic/SpatialUpSamplingBilinear.c" #include "THGenerateFloatTypes.h" -#include "generic/SpatialGridSamplerBilinear.c" -#include "THGenerateFloatTypes.h" - #include "generic/VolumetricAveragePooling.c" #include "THGenerateFloatTypes.h" @@ -304,6 +301,3 @@ #include "generic/VolumetricUpSamplingTrilinear.c" #include "THGenerateFloatTypes.h" - -#include "generic/VolumetricGridSamplerBilinear.c" -#include "THGenerateFloatTypes.h" diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 0d84ccbfb606a1..588dae10e8e8e3 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -40,6 +40,7 @@ if(BUILD_ATEN) # ATen tests use catch instead of gtest so keep separate for now # list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS}) # list(APPEND Caffe2_GPU_TEST_SRCS ${ATen_CUDA_TEST_SRCS}) + list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CORE_TEST_SRCS}) list(APPEND Caffe2_CPU_INCLUDE ${ATen_CPU_INCLUDE}) list(APPEND Caffe2_GPU_INCLUDE ${ATen_CUDA_INCLUDE}) list(APPEND Caffe2_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS}) @@ -51,6 +52,15 @@ if(BUILD_ATEN) set(Caffe2_HIP_SRCS ${ATen_CUDA_SRCS}) set(Caffe2_HIP_INCLUDES ${Caffe2_HIP_INCLUDES} ${Caffe2_GPU_INCLUDE}) ENDIF(USE_ROCM) +else() + # Only add "ATen Core", a minimal, easy-to-compile fragment of ATen. + # This codepath should only be exercised by the Android build. + add_subdirectory(../aten/src/ATen/core ATen_core) + list(APPEND Caffe2_CPU_SRCS ${ATen_CORE_SRCS}) + list(APPEND Caffe2_CPU_INCLUDE ${ATen_CORE_INCLUDE}) + list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CORE_TEST_SRCS}) + # TODO: We should probably install the headers, but I don't know + # how to do that. endif() # ---[ Torch build @@ -215,6 +225,72 @@ target_include_directories(caffe2 SYSTEM PRIVATE "${Caffe2_DEPENDENCY_INCLUDE}") aten_set_target_props(caffe2) target_compile_options(caffe2 INTERFACE "-std=c++11") target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB") +if (MSVC AND NOT BUILD_SHARED_LIBS) + # Note [Supporting both static and dynamic libraries on Window] + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # A Windows library may be distributed as either a static or dynamic + # library. The chosen distribution mechanism affects how you setup + # the headers for the library: if you statically link a function, + # all you need is an ordinary signature: + # + # void f(); + # + # But if you *dynamically* link it, then you must provide a __declspec + # specifying that it should be imported from a DLL: + # + # __declspec(dllimport) void f(); + # + # Mixing the two situations will not work: if you specify dllimport + # while statically linking, the linker will complain it cannot find + # the __imp_f symbol (which serve as the DLL entrypoint); if you + # fail to specify dllimport for a symbol that's coming from a DLL, + # the linker will complain that it can't find f. Joy! + # + # Most places on the Internet, you will find people have written + # their headers under the assumption that the application will + # only ever be dynamically linked, as they define a macro which + # tags a function as __declspec(dllexport) if you are actually + # building the library, and __declspec(dllimport) otherwise. But + # if you want these headers to also work if you are linking against + # a static library, you need a way to avoid adding these __declspec's + # at all. 
And that "mechanism" needs to apply to any downstream + # libraries/executables which are going to link against your library. + # + # As an aside, why do we need to support both modes? + # For historical reasons, PyTorch ATen on Windows is built dynamically, + # while Caffe2 on Windows is built statically (mostly because if + # we build it dynamically, we are over the DLL exported symbol limit--and + # that is because Caffe2 hasn't comprehensively annotated all symbols + # which cross the DLL boundary with CAFFE_API). So any code + # which is used by both PyTorch and Caffe2 needs to support both + # modes of linking. + # + # So, you have a macro (call it AT_CORE_STATIC_WINDOWS) which you need to have + # set for any downstream library/executable that transitively includes your + # headers. How are you going to do this? You have two options: + # + # 1. Write out a config.h header which stores whether or not + # you are linking statically or dynamically. + # + # 2. Force all of users to set the the macro themselves. If they + # use cmake, you can set -DAT_CORE_STATIC_WINDOWS=1 as a PUBLIC + # compile option, in which case cmake will automatically + # add the macro for you. + # + # Which one is better? Well, it depends: they trade off implementor + # ease versus user ease: (1) is more work for the library author + # but the user doesn't have to worry about it; (2) requires the user + # to set the macro themselves... but only if they don't use cmake. + # + # So, which is appropriate in our situation? In my mind, here is + # the distinguishing factor: it is more common to distribute + # DLLs, since they don't require you to line up the CRT version + # (/MD, /MDd, /MT, /MTd) and MSVC version at the use site. So, + # if a user is already in the business of static linkage, they're + # already in "expert user" realm. So, I've decided that at this + # point in time, the simplicity of implementation of (2) wins out. 
+ target_compile_options(caffe2 PUBLIC "-DAT_CORE_STATIC_WINDOWS=1") +endif() # Use -O2 for release builds (-O3 doesn't improve perf, and -Os results in perf regression) target_compile_options(caffe2 PRIVATE "$<$,$>:-O2>") install(TARGETS caffe2 EXPORT Caffe2Targets DESTINATION lib) diff --git a/caffe2/contrib/aten/aten_op.cc b/caffe2/contrib/aten/aten_op.cc index bc93f4866ebc28..df3ee5326b7d90 100644 --- a/caffe2/contrib/aten/aten_op.cc +++ b/caffe2/contrib/aten/aten_op.cc @@ -10,7 +10,6 @@ at::Backend ATenOp::backend() const { } OPERATOR_SCHEMA(ATen); -CAFFE_KNOWN_TYPE(at::Half); namespace math { template <> diff --git a/caffe2/core/context.h b/caffe2/core/context.h index f2831909e1587a..fc3969879f30c4 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -13,6 +13,9 @@ #include "caffe2/core/typeid.h" #include "caffe2/proto/caffe2.pb.h" +#include "ATen/core/ATenCoreTest.h" +#include "ATen/core/ArrayRef.h" + CAFFE2_DECLARE_bool(caffe2_report_cpu_memory_usage); namespace caffe2 { diff --git a/caffe2/core/context_test.cc b/caffe2/core/context_test.cc index a6e44846e9e0be..8924a9dc931be9 100644 --- a/caffe2/core/context_test.cc +++ b/caffe2/core/context_test.cc @@ -6,6 +6,11 @@ namespace caffe2 { +TEST(CPUContextTest, ATenCoreTest) { + int i = at::CoreTest(); + EXPECT_EQ(i + 1, at::CoreTest()); +} + TEST(CPUContextTest, TestAllocAlignment) { for (int i = 1; i < 10; ++i) { auto data = CPUContext::New(i); diff --git a/caffe2/core/dispatch/DeviceId.h b/caffe2/core/dispatch/DeviceId.h index e74a803557ea0d..e5744ce1e1c2d6 100644 --- a/caffe2/core/dispatch/DeviceId.h +++ b/caffe2/core/dispatch/DeviceId.h @@ -1,8 +1,8 @@ #pragma once +#include #include #include -#include "caffe2/utils/C++17.h" namespace c10 { diff --git a/caffe2/core/dispatch/LayoutId.h b/caffe2/core/dispatch/LayoutId.h index 7f039fadfa9698..9ec44519b95a99 100644 --- a/caffe2/core/dispatch/LayoutId.h +++ b/caffe2/core/dispatch/LayoutId.h @@ -1,10 +1,10 @@ #pragma once -#include "caffe2/utils/IdWrapper.h" +#include "ATen/core/IdWrapper.h" namespace c10 { -class LayoutId final : public c10::guts::IdWrapper { +class LayoutId final : public at::IdWrapper { public: constexpr explicit LayoutId(underlying_type id): IdWrapper(id) {} @@ -19,4 +19,4 @@ class LayoutId final : public c10::guts::IdWrapper { } -C10_DEFINE_HASH_FOR_IDWRAPPER(c10::LayoutId) +AT_DEFINE_HASH_FOR_IDWRAPPER(c10::LayoutId) diff --git a/caffe2/core/dispatch/TensorTypeId.h b/caffe2/core/dispatch/TensorTypeId.h index a80fc8377c8ca5..244817904667b9 100644 --- a/caffe2/core/dispatch/TensorTypeId.h +++ b/caffe2/core/dispatch/TensorTypeId.h @@ -1,6 +1,6 @@ #pragma once -#include "caffe2/utils/IdWrapper.h" +#include "ATen/core/IdWrapper.h" #include #include #include @@ -21,7 +21,7 @@ namespace details { /** * Dynamic type ID of a Tensor argument. It represents something like CPUTensor, etc. */ -class TensorTypeId final : public guts::IdWrapper { +class TensorTypeId final : public at::IdWrapper { public: // Don't use this! 
// Unfortunately, a default constructor needs to be defined because of https://reviews.llvm.org/D41223 @@ -35,4 +35,4 @@ class TensorTypeId final : public guts::IdWrapper namespace c10 { diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h index 1e8156abe42172..70490856b5ecaf 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h @@ -659,336 +659,3 @@ class NHWC2NCHW : public NeuralNetOperator { private: }; - -class Int8Quantize : public NeuralNetOperator { - public: - Int8Quantize() : NeuralNetOperator(NNKind::Int8Quantize) {} - - ~Int8Quantize() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Quantize); - - private: -}; - -class Int8Dequantize : public NeuralNetOperator { - public: - Int8Dequantize() : NeuralNetOperator(NNKind::Int8Dequantize) {} - - ~Int8Dequantize() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Dequantize); - - private: -}; - -class Int8AveragePool : public NeuralNetOperator { - public: - Int8AveragePool() : NeuralNetOperator(NNKind::Int8AveragePool) {} - - Int8AveragePool(const AveragePool& averagePool) - : NeuralNetOperator(NNKind::Int8AveragePool) {} - - ~Int8AveragePool() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8AveragePool); - - private: -}; - -class Int8Conv : public NeuralNetOperator { - public: - Int8Conv() : NeuralNetOperator(NNKind::Int8Conv) {} - - Int8Conv(const Conv& conv) : NeuralNetOperator(NNKind::Int8Conv) {} - - ~Int8Conv() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Conv); - - private: -}; - -class Int8ConvTranspose : public NeuralNetOperator { - public: - Int8ConvTranspose() : NeuralNetOperator(NNKind::Int8ConvTranspose) {} - - Int8ConvTranspose(const ConvTranspose& convTranspose) - : NeuralNetOperator(NNKind::Int8ConvTranspose) {} - - ~Int8ConvTranspose() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8ConvTranspose); - - private: -}; - -class Int8FC : public NeuralNetOperator { - public: - Int8FC() : NeuralNetOperator(NNKind::Int8FC) {} - - Int8FC(const FC& fC) : NeuralNetOperator(NNKind::Int8FC) {} - - ~Int8FC() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8FC); - - private: -}; - -class Int8MaxPool : public NeuralNetOperator { - public: - Int8MaxPool() : NeuralNetOperator(NNKind::Int8MaxPool) {} - - Int8MaxPool(const MaxPool& maxPool) - : NeuralNetOperator(NNKind::Int8MaxPool) {} - - ~Int8MaxPool() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8MaxPool); - - private: -}; - -class Int8Relu : public NeuralNetOperator { - public: - Int8Relu() : NeuralNetOperator(NNKind::Int8Relu) {} - - Int8Relu(const Relu& relu) : NeuralNetOperator(NNKind::Int8Relu) {} - - ~Int8Relu() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Relu); - - private: -}; - -class Int8GivenTensorFill : public NeuralNetOperator { - public: - Int8GivenTensorFill() : NeuralNetOperator(NNKind::Int8GivenTensorFill) {} - - Int8GivenTensorFill(const GivenTensorFill& givenTensorFill) - : NeuralNetOperator(NNKind::Int8GivenTensorFill) {} - - ~Int8GivenTensorFill() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8GivenTensorFill); - - private: -}; - -class Int8Concat : public NeuralNetOperator { - public: - Int8Concat() : NeuralNetOperator(NNKind::Int8Concat) {} - - Int8Concat(const Concat& concat) : NeuralNetOperator(NNKind::Int8Concat) {} - - ~Int8Concat() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Concat); - - private: -}; - -class Int8Softmax : public NeuralNetOperator { - public: - Int8Softmax() : NeuralNetOperator(NNKind::Int8Softmax) {} - - Int8Softmax(const Softmax& softmax) - : 
NeuralNetOperator(NNKind::Int8Softmax) {} - - ~Int8Softmax() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Softmax); - - private: -}; - -class Int8ChannelShuffle : public NeuralNetOperator { - public: - Int8ChannelShuffle() : NeuralNetOperator(NNKind::Int8ChannelShuffle) {} - - Int8ChannelShuffle(const ChannelShuffle& channelShuffle) - : NeuralNetOperator(NNKind::Int8ChannelShuffle) {} - - ~Int8ChannelShuffle() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8ChannelShuffle); - - private: -}; - -class Int8Sum : public NeuralNetOperator { - public: - Int8Sum() : NeuralNetOperator(NNKind::Int8Sum) {} - - Int8Sum(const Sum& sum) : NeuralNetOperator(NNKind::Int8Sum) {} - - ~Int8Sum() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Sum); - - private: -}; - -class Int8Add : public NeuralNetOperator { - public: - Int8Add() : NeuralNetOperator(NNKind::Int8Add) {} - - Int8Add(const Add& add) : NeuralNetOperator(NNKind::Int8Add) {} - - ~Int8Add() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Add); - - private: -}; - -class Int8Reshape : public NeuralNetOperator { - public: - Int8Reshape() : NeuralNetOperator(NNKind::Int8Reshape) {} - - Int8Reshape(const Reshape& reshape) - : NeuralNetOperator(NNKind::Int8Reshape) {} - - ~Int8Reshape() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Reshape); - - private: -}; - -class Int8Flatten : public NeuralNetOperator { - public: - Int8Flatten() : NeuralNetOperator(NNKind::Int8Flatten) {} - - Int8Flatten(const Flatten& flatten) - : NeuralNetOperator(NNKind::Int8Flatten) {} - - ~Int8Flatten() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8Flatten); - - private: -}; - -class Int8ConvRelu : public NeuralNetOperator { - public: - Int8ConvRelu() : NeuralNetOperator(NNKind::Int8ConvRelu) {} - - Int8ConvRelu(const ConvRelu& convRelu) - : NeuralNetOperator(NNKind::Int8ConvRelu) {} - - ~Int8ConvRelu() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8ConvRelu); - - private: -}; - -class Int8SumRelu : public NeuralNetOperator { - public: - Int8SumRelu() : NeuralNetOperator(NNKind::Int8SumRelu) {} - - Int8SumRelu(const SumRelu& sumRelu) - : NeuralNetOperator(NNKind::Int8SumRelu) {} - - ~Int8SumRelu() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8SumRelu); - - private: -}; - -class Int8AveragePoolRelu : public NeuralNetOperator { - public: - Int8AveragePoolRelu() : NeuralNetOperator(NNKind::Int8AveragePoolRelu) {} - - Int8AveragePoolRelu(const AveragePoolRelu& averagePoolRelu) - : NeuralNetOperator(NNKind::Int8AveragePoolRelu) {} - - ~Int8AveragePoolRelu() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8AveragePoolRelu); - - private: -}; - -class Int8MaxPoolRelu : public NeuralNetOperator { - public: - Int8MaxPoolRelu() : NeuralNetOperator(NNKind::Int8MaxPoolRelu) {} - - Int8MaxPoolRelu(const MaxPoolRelu& maxPoolRelu) - : NeuralNetOperator(NNKind::Int8MaxPoolRelu) {} - - ~Int8MaxPoolRelu() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(Int8MaxPoolRelu); - - private: -}; - -class BatchMatMul : public NeuralNetOperator { - public: - BatchMatMul(bool transA = false, bool transB = true, bool broadcast = false) - : NeuralNetOperator(NNKind::BatchMatMul), - TransA(transA), - TransB(transB), - Broadcast(broadcast) {} - - ~BatchMatMul() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(BatchMatMul); - - bool getTransA() const { - return TransA; - } - - bool getTransB() const { - return TransB; - } - - bool getBroadcast() const { - return Broadcast; - } - - void setTransA(bool transA) { - TransA = transA; - } - - void setTransB(bool transB) { - TransB = transB; - } - - void setBroadcast(bool broadcast) { - Broadcast = broadcast; - } - - private: - bool TransA; - bool TransB; - bool Broadcast; -}; - -class 
BatchGather : public NeuralNetOperator { - public: - BatchGather() : NeuralNetOperator(NNKind::BatchGather) {} - - ~BatchGather() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(BatchGather); - - private: -}; - -class ConcatBatchMatMulBatchGatherOp : public NeuralNetOperator { - public: - ConcatBatchMatMulBatchGatherOp() - : NeuralNetOperator(NNKind::ConcatBatchMatMulBatchGatherOp) {} - - ~ConcatBatchMatMulBatchGatherOp() {} - - NOMNIGRAPH_DEFINE_NN_RTTI(ConcatBatchMatMulBatchGatherOp); - - private: -}; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpEnum.h b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpEnum.h index 9c4277293d0b41..4d15dd40613403 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpEnum.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpEnum.h @@ -1,9 +1,4 @@ Relu, Conv, ConvRelu, ConvTranspose, AveragePool, AveragePoolRelu, MaxPool, MaxPoolRelu, Sum, SumRelu, Send, Receive, BatchNormalization, FC, GivenTensorFill, Concat, Softmax, ChannelShuffle, Add, Reshape, Flatten, - NCHW2NHWC, NHWC2NCHW, Int8Quantize, Int8Dequantize, Int8AveragePool, - Int8Conv, Int8ConvTranspose, Int8FC, Int8MaxPool, Int8Relu, - Int8GivenTensorFill, Int8Concat, Int8Softmax, Int8ChannelShuffle, Int8Sum, - Int8Add, Int8Reshape, Int8Flatten, Int8ConvRelu, Int8SumRelu, - Int8AveragePoolRelu, Int8MaxPoolRelu, BatchMatMul, BatchGather, - ConcatBatchMatMulBatchGatherOp + NCHW2NHWC, NHWC2NCHW diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h index 87ffda3c4f3436..88ffa0b1ba6bb0 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h @@ -1,92 +1,68 @@ case NNKind::Relu: return "Relu"; + case NNKind::Conv: return "Conv"; + case NNKind::ConvRelu: return "ConvRelu"; + case NNKind::ConvTranspose: return "ConvTranspose"; + case NNKind::AveragePool: return "AveragePool"; + case NNKind::AveragePoolRelu: return "AveragePoolRelu"; + case NNKind::MaxPool: return "MaxPool"; + case NNKind::MaxPoolRelu: return "MaxPoolRelu"; + case NNKind::Sum: return "Sum"; + case NNKind::SumRelu: return "SumRelu"; + case NNKind::Send: return "Send"; + case NNKind::Receive: return "Receive"; + case NNKind::BatchNormalization: return "BatchNormalization"; + case NNKind::FC: return "FC"; + case NNKind::GivenTensorFill: return "GivenTensorFill"; + case NNKind::Concat: return "Concat"; + case NNKind::Softmax: return "Softmax"; + case NNKind::ChannelShuffle: return "ChannelShuffle"; + case NNKind::Add: return "Add"; + case NNKind::Reshape: return "Reshape"; + case NNKind::Flatten: return "Flatten"; + case NNKind::NCHW2NHWC: return "NCHW2NHWC"; + case NNKind::NHWC2NCHW: return "NHWC2NCHW"; -case NNKind::Int8Quantize: - return "Int8Quantize"; -case NNKind::Int8Dequantize: - return "Int8Dequantize"; -case NNKind::Int8AveragePool: - return "Int8AveragePool"; -case NNKind::Int8Conv: - return "Int8Conv"; -case NNKind::Int8ConvTranspose: - return "Int8ConvTranspose"; -case NNKind::Int8FC: - return "Int8FC"; -case NNKind::Int8MaxPool: - return "Int8MaxPool"; -case NNKind::Int8Relu: - return "Int8Relu"; -case NNKind::Int8GivenTensorFill: - return "Int8GivenTensorFill"; -case NNKind::Int8Concat: - return "Int8Concat"; -case NNKind::Int8Softmax: - return "Int8Softmax"; -case NNKind::Int8ChannelShuffle: - return "Int8ChannelShuffle"; -case NNKind::Int8Sum: - return "Int8Sum"; -case NNKind::Int8Add: - return "Int8Add"; -case 
NNKind::Int8Reshape: - return "Int8Reshape"; -case NNKind::Int8Flatten: - return "Int8Flatten"; -case NNKind::Int8ConvRelu: - return "Int8ConvRelu"; -case NNKind::Int8SumRelu: - return "Int8SumRelu"; -case NNKind::Int8AveragePoolRelu: - return "Int8AveragePoolRelu"; -case NNKind::Int8MaxPoolRelu: - return "Int8MaxPoolRelu"; -case NNKind::BatchMatMul: - return "BatchMatMul"; -case NNKind::BatchGather: - return "BatchGather"; -case NNKind::ConcatBatchMatMulBatchGatherOp: - return "ConcatBatchMatMulBatchGatherOp"; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h index 3c5148e5b6c70f..aab127d8c56e16 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h @@ -46,28 +46,31 @@ class Edge : public StorageType { public: using NodeRef = typename Graph::NodeRef; Edge(NodeRef tail, NodeRef head, U... args) - : StorageType(std::forward(args)...), Tail(tail), Head(head) { + : StorageType(std::forward(args)...), + tail_(tail), + head_(head) { DEBUG_PRINT("Creating instance of Edge: %p\n", this); } const NodeRef& tail() const { - return Tail; + return tail_; } const NodeRef& head() const { - return Head; + return head_; } void setTail(NodeRef n) { - Tail = n; + tail_ = n; } void setHead(NodeRef n) { - Head = n; + head_ = n; } private: - NodeRef Tail; - NodeRef Head; + NodeRef tail_; + NodeRef head_; + friend class Graph; }; @@ -88,54 +91,55 @@ class Node : public StorageType, public Notifier> { /// \brief Adds an edge by reference to known in-edges. /// \p e A reference to an edge that will be added as an in-edge. void addInEdge(EdgeRef e) { - inEdges.emplace_back(e); + inEdges_.emplace_back(e); } /// \brief Adds an edge by reference to known out-edges. /// \p e A reference to an edge that will be added as an out-edge. void addOutEdge(EdgeRef e) { - outEdges.emplace_back(e); + outEdges_.emplace_back(e); } /// \brief Removes an edge by reference to known in-edges. /// \p e A reference to an edge that will be removed from in-edges. void removeInEdge(EdgeRef e) { - auto iter = std::find(inEdges.begin(), inEdges.end(), e); - assert( - iter != inEdges.end() && - "Attempted to remove edge that isn't connected to this node"); - inEdges.erase(iter); + removeEdgeInternal(inEdges_, e); } /// \brief Removes an edge by reference to known out-edges. /// \p e A reference to an edge that will be removed from out-edges. 
void removeOutEdge(EdgeRef e) { - auto iter = std::find(outEdges.begin(), outEdges.end(), e); - assert( - iter != outEdges.end() && - "Attempted to remove edge that isn't connected to this node"); - outEdges.erase(iter); + removeEdgeInternal(outEdges_, e); } const std::vector& getOutEdges() const { - return outEdges; + return outEdges_; } const std::vector& getInEdges() const { - return inEdges; + return inEdges_; } - void setInEdges(std::vector es) { - inEdges = es; + void setInEdges(std::vector edges) { + inEdges_ = edges; } - void setOutEdges(std::vector es) { - outEdges = es; + void setOutEdges(std::vector edges) { + outEdges_ = edges; } - protected: - std::vector inEdges; - std::vector outEdges; + private: + std::vector inEdges_; + std::vector outEdges_; + friend class Graph; + + void removeEdgeInternal(std::vector& edges, EdgeRef e) { + auto iter = std::find(edges.begin(), edges.end(), e); + assert( + iter != edges.end() && + "Attempted to remove edge that isn't connected to this node"); + edges.erase(iter); + } }; /// \brief Effectively a constant reference to a graph. @@ -158,46 +162,56 @@ class Subgraph { using EdgeRef = typename Graph::EdgeRef; void addNode(NodeRef n) { - Nodes.insert(n); + nodes_.insert(n); } + bool hasNode(NodeRef n) const { - return Nodes.count(n) != 0; + return nodes_.count(n) != 0; } + void removeNode(NodeRef n) { - Nodes.erase(n); + nodes_.erase(n); } void addEdge(EdgeRef e) { - Edges.insert(e); + edges_.insert(e); } - bool hasEdge(EdgeRef n) const { - return Edges.count(n) != 0; + + bool hasEdge(EdgeRef e) const { + return edges_.count(e) != 0; } + void removeEdge(EdgeRef e) { - Edges.erase(e); + edges_.erase(e); } const std::unordered_set& getNodes() const { - return Nodes; + return nodes_; + } + + const size_t getNodesCount() const { + return (size_t)nodes_.size(); } + const std::unordered_set& getEdges() const { - return Edges; + return edges_; } + private: + std::unordered_set nodes_; + std::unordered_set edges_; + void printEdges() { - for (const auto& edge : Edges) { + for (const auto& edge : edges_) { printf("Edge: %p (%p -> %p)\n", &edge, edge->tail(), edge->head()); } } void printNodes() const { - for (const auto& node : Nodes) { + for (const auto& node : nodes_) { printf("Node: %p\n", node); } } - - std::unordered_set Nodes; - std::unordered_set Edges; }; /// \brief A simple graph implementation @@ -231,21 +245,21 @@ class Graph { } void importNode(NodeRef node, Graph& otherGraph) { - for (auto it = Nodes.begin(); it != Nodes.end(); ++it) { + for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { if (&(*it) == node) { - std::list>& otherNodes = otherGraph.Nodes; - otherNodes.splice(otherNodes.end(), Nodes, it, ++it); - otherGraph.NodeRefs.insert(node); + std::list>& otherNodes = otherGraph.nodes_; + otherNodes.splice(otherNodes.end(), nodes_, it, ++it); + otherGraph.nodeRefs_.insert(node); break; } } } void importEdge(EdgeRef edge, Graph& otherGraph) { - std::list>& otherEdges = otherGraph.Edges; - for (auto it = Edges.begin(); it != Edges.end(); ++it) { + std::list>& otherEdges = otherGraph.edges_; + for (auto it = edges_.begin(); it != edges_.end(); ++it) { if (&(*it) == edge) { - otherEdges.splice(otherEdges.end(), Edges, it, ++it); + otherEdges.splice(otherEdges.end(), edges_, it, ++it); break; } } @@ -313,9 +327,9 @@ class Graph { /// \return A reference to the edge created. EdgeRef createEdge(NodeRef tail, NodeRef head, U... 
data) { DEBUG_PRINT("Creating edge (%p -> %p)\n", tail, head); - this->Edges.emplace_back( + this->edges_.emplace_back( Edge(tail, head, std::forward(data)...)); - EdgeRef e = &this->Edges.back(); + EdgeRef e = &this->edges_.back(); head->addInEdge(e); tail->addOutEdge(e); return e; @@ -339,85 +353,85 @@ class Graph { /// related to the node. void deleteNode(NodeRef n, bool deleteEdges = true) { if (deleteEdges) { - auto inEdges = n->inEdges; + auto inEdges = n->inEdges_; for (auto& edge : inEdges) { deleteEdge(edge); } - auto outEdges = n->outEdges; + auto outEdges = n->outEdges_; for (auto& edge : outEdges) { deleteEdge(edge); } } - for (auto i = Nodes.begin(); i != Nodes.end(); ++i) { + for (auto i = nodes_.begin(); i != nodes_.end(); ++i) { if (&*i == n) { - NodeRefs.erase(n); - Nodes.erase(i); + nodeRefs_.erase(n); + nodes_.erase(i); break; } } } - bool hasNode(NodeRef ref) const { - return NodeRefs.find(ref) != NodeRefs.end(); + bool hasNode(NodeRef node) const { + return nodeRefs_.find(node) != nodeRefs_.end(); } /// \brief Deletes a edge from the graph. /// \p e A reference to the edge. - void deleteEdge(EdgeRef e, bool remove_ref = true) { - if (remove_ref) { - e->Tail->removeOutEdge(e); - e->Head->removeInEdge(e); + void deleteEdge(EdgeRef e, bool removeRef = true) { + if (removeRef) { + e->tail_->removeOutEdge(e); + e->head_->removeInEdge(e); } - for (auto i = Edges.begin(); i != Edges.end(); ++i) { + for (auto i = edges_.begin(); i != edges_.end(); ++i) { if (&*i == e) { - Edges.erase(i); + edges_.erase(i); break; } } } const std::vector getMutableNodes() { - std::vector v; - for (auto& n : Nodes) { + std::vector result; + for (auto& n : nodes_) { DEBUG_PRINT("Adding node to mutable output (%p)\n", &n); - v.emplace_back(&n); + result.emplace_back(&n); } - return v; + return result; } const std::vector getMutableEdges() { - std::vector v; - for (auto& e : Edges) { + std::vector result; + for (auto& e : edges_) { DEBUG_PRINT("Adding edge to mutable output (%p)\n", &e); - v.emplace_back(&e); + result.emplace_back(&e); } - return v; + return result; + } + + private: + std::list> nodes_; + std::list> edges_; + std::unordered_set nodeRefs_; + + NodeRef createNodeInternal(Node&& node) { + nodes_.emplace_back(std::move(node)); + NodeRef nodeRef = &nodes_.back(); + DEBUG_PRINT("Creating node (%p)\n", nodeRef); + nodeRefs_.insert(nodeRef); + return nodeRef; } void printEdges() { - for (const auto& edge : Edges) { + for (const auto& edge : edges_) { printf("Edge: %p (%p -> %p)\n", &edge, edge.tail(), edge.head()); } } void printNodes() const { - for (const auto& node : Nodes) { + for (const auto& node : nodes_) { printf("Node: %p\n", &node); } } - - private: - std::list> Nodes; - std::list> Edges; - std::unordered_set NodeRefs; - - NodeRef createNodeInternal(Node&& node) { - Nodes.emplace_back(std::move(node)); - NodeRef nodeRef = &Nodes.back(); - DEBUG_PRINT("Creating node (%p)\n", nodeRef); - NodeRefs.insert(nodeRef); - return nodeRef; - } }; } // namespace nom diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h new file mode 100644 index 00000000000000..08ead742950740 --- /dev/null +++ b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h @@ -0,0 +1,174 @@ +#ifndef NOM_TRANFORMATIONS_SUBGRAPH_MATCHER_H +#define NOM_TRANFORMATIONS_SUBGRAPH_MATCHER_H + +namespace nom { + +namespace matcher { + +/* + * Subtree matching criteria consists of + * - Node 
matching criteria for the subtree's root. + * - Children subtree matching criteria + * - A count, which means we may want more than one of this subtree. The count + * can be unlimited. The count is only used when we match children of a + * subtree root, not matching the subtree itself. + */ +template +class SubtreeMatchCriteria { + public: + static const int kStarCount = -1; + SubtreeMatchCriteria( + const NodeMatchCriteria& root, + const std::vector& children, + int count) + : root_(root), children_(children), count_(count){}; + + private: + NodeMatchCriteria root_; + std::vector children_; + int count_; + + template + friend class SubgraphMatcher; +}; + +/* + * Utilities for subgraph matching. + */ +template < + typename GraphType, + typename NodeMatchCriteria, + typename NodeMatcherClass> +struct SubgraphMatcher { + static bool isNodeMatch( + typename GraphType::NodeRef node, + const NodeMatchCriteria& criteria) { + return NodeMatcherClass::isMatch(node, criteria); + } + + // Check if there can be a sub-tree that matches the given criteria that + // is rooted at the given rootNode. + // The flag invertGraphTraversal specify if we should follow out edges or + // in edges. The default is true which is useful for a functional + // intepretation of a dataflow graph. + static bool isSubtreeMatch( + typename GraphType::NodeRef root, + const SubtreeMatchCriteria& criteria, + bool invertGraphTraversal = true) { + if (!isNodeMatch(root, criteria.root_)) { + return false; + } + auto& edges = + invertGraphTraversal ? root->getInEdges() : root->getOutEdges(); + + int numEdges = edges.size(); + int numChildrenCriteria = criteria.children_.size(); + + // The current algorithm implies that the ordering of the children is + // important. The children nodes will be matched with the children subtree + // criteria in the given order. + + int currentEdgeIdx = 0; + for (int criteriaIdx = 0; criteriaIdx < numChildrenCriteria; + criteriaIdx++) { + auto childrenCriteria = criteria.children_[criteriaIdx]; + + int expectedCount = childrenCriteria.count_; + bool isStarCount = + expectedCount == SubtreeMatchCriteria::kStarCount; + + int countMatch = 0; + + // Continue to match subsequent edges with the current children criteria. + // Note that if the child criteria is a * pattern, this greedy algorithm + // will attempt to find the longest possible sequence that matches the + // children criteria. + for (; currentEdgeIdx < numEdges && + (isStarCount || countMatch < expectedCount); + currentEdgeIdx++) { + auto edge = edges[currentEdgeIdx]; + auto nextNode = invertGraphTraversal ? edge->tail() : edge->head(); + + if (!isSubtreeMatch(nextNode, childrenCriteria, invertGraphTraversal)) { + if (!isStarCount) { + // If the current criteria isn't a * pattern, this indicates a + // failure. + return false; + } else { + // Otherwise, we should move on to the next children criteria. + break; + } + } + + countMatch++; + } + + if (countMatch < expectedCount) { + // Fails because there are not enough matches as specified by the + // criteria. + return false; + } + } + + if (currentEdgeIdx < numEdges) { + // Fails because there are unmatched edges. + return false; + } + return true; + } + + // Utility to transform a graph by looking for subtrees that match + // a given pattern and then allow callers to mutate the graph based on + // subtrees that are found. + // The current implementation doesn't handle any graph transformation + // itself. 
Callers should be responsible for all intended mutation, including + // deleting nodes in the subtrees found by this algorithm. + // Note: if the replaceFunction lambda returns false, the entire procedure + // is aborted. This maybe useful in certain cases when we want to terminate + // the subtree search early. + // invertGraphTraversal flag: see documentation in isSubtreeMatch + static void replaceSubtree( + GraphType& graph, + const SubtreeMatchCriteria& criteria, + const std::function< + bool(GraphType& g, typename GraphType::NodeRef subtreeRoot)>& + replaceFunction, + bool invertGraphTraversal = true) { + for (auto nodeRef : graph.getMutableNodes()) { + // Make sure the node is still in the graph. + if (!graph.hasNode(nodeRef)) { + continue; + } + if (isSubtreeMatch(nodeRef, criteria, invertGraphTraversal)) { + if (!replaceFunction(graph, nodeRef)) { + // If replaceFunction returns false, it means that we should abort + // the entire procedure. + break; + } + } + } + } +}; + +// Convenient methods to create subtree matching criteria. +template +SubtreeMatchCriteria tree( + const NodeMatchCriteria& root, + const std::vector>& children = {}, + int count = 1) { + return SubtreeMatchCriteria(root, children, count); +} + +template +SubtreeMatchCriteria treeStar( + const NodeMatchCriteria& root, + const std::vector>& children = {}) { + return tree( + root, children, SubtreeMatchCriteria::kStarCount); +} + +} // namespace matcher + +} // namespace nom + +#endif // NOM_TRANFORMATIONS_SUBGRAPH_MATCHER_H diff --git a/caffe2/core/nomnigraph/op_gen.py b/caffe2/core/nomnigraph/op_gen.py index c62148ea52cff5..2d1125f5762ad4 100755 --- a/caffe2/core/nomnigraph/op_gen.py +++ b/caffe2/core/nomnigraph/op_gen.py @@ -6,6 +6,8 @@ from __future__ import unicode_literals import argparse +from textwrap import dedent +from subprocess import call def parse_lines(lines): @@ -22,25 +24,27 @@ def parse_lines(lines): index = 0 while index < len(lines): line = lines[index] - if line.lower().startswith('macro'): - assert (parse_state == EMPTY) - macro_line = line.split(' ') + if line.lower().startswith("macro"): + assert parse_state == EMPTY + macro_line = line.split(" ") # Support macros that look like attributes # e.g. 
macro - CONV_LIKE - curr_macro = ' '.join(macro_line[1:]) - assert (curr_macro not in macros) + curr_macro = " ".join(macro_line[1:]) + assert curr_macro not in macros, 'Macro "{}" defined twice.'.format( + curr_macro + ) macros[curr_macro] = [] parse_state = MACRO - lines = lines[:index] + lines[index + 1:] + lines = lines[:index] + lines[index + 1 :] continue - elif line.lower().startswith('endmacro'): - assert (parse_state == MACRO) + elif line.lower().startswith("endmacro"): + assert parse_state == MACRO parse_state = EMPTY - lines = lines[:index] + lines[index + 1:] + lines = lines[:index] + lines[index + 1 :] continue elif parse_state == MACRO: macros[curr_macro].append(line) - lines = lines[:index] + lines[index + 1:] + lines = lines[:index] + lines[index + 1 :] continue index += 1 @@ -48,7 +52,7 @@ def parse_lines(lines): while index < len(lines): line = lines[index] if line in macros: - lines = lines[:index] + macros[line] + lines[index + 1:] + lines = lines[:index] + macros[line] + lines[index + 1 :] index += len(macros[line]) - 1 index += 1 @@ -63,20 +67,20 @@ def parse_lines(lines): for line in lines: if not len(line): continue - if line[0] == '-': - assert (parse_state is OP) - attr = [_.strip() for _ in line[1:].split(':')] - assert (attr[0][0].isupper()) - if (len(attr) == 2): # attribute : type + if line[0] == "-": + assert parse_state is OP + attr = [_.strip() for _ in line[1:].split(":")] + assert attr[0][0].isupper() + if len(attr) == 2: # attribute : type ops[curr_op]["attributes"].append((attr[0], attr[1])) - elif (len(attr) == 3): # attribute : type + elif len(attr) == 3: # attribute : type ops[curr_op]["attributes"].append((attr[0], attr[1], attr[2])) else: - op = [l.strip() for l in line.split(':')] - assert (len(op[0].split(' ')) == 1) + op = [l.strip() for l in line.split(":")] + assert len(op[0].split(" ")) == 1 parse_state = OP curr_op = op[0] - assert (curr_op not in ops) + assert curr_op not in ops ops[curr_op] = {} op_list.append(curr_op) if len(op) > 1: @@ -101,20 +105,26 @@ def gen_class(op, op_def): attr_arg = "{type} {lower_name}".format( type=t, lower_name=lower_name + default_arg ) - attr_init = "{name}({lower_name})".format( - name=name, lower_name=lower_name - ) + attr_init = "{name}({lower_name})".format(name=name, lower_name=lower_name) attr_declare = "{type} {name};".format(type=t, name=name) - attr_get = """ - {type} get{name}() const {{ - return {name}; - }} -""".format(type=t, name=name) - attr_set = """ - void set{name}({type} {lower_name}) {{ - {name} = {lower_name}; - }} -""".format(type=t, name=name, lower_name=lower_name) + attr_get = dedent( + """ + {type} get{name}() const {{ + return {name}; + }} + """.format( + type=t, name=name + ) + ) + attr_set = dedent( + """ + void set{name}({type} {lower_name}) {{ + {name} = {lower_name}; + }} + """.format( + type=t, name=name, lower_name=lower_name + ) + ) attribute_args.append(attr_arg) attribute_init.append(attr_init) attribute_declarations.append(attr_declare) @@ -132,38 +142,43 @@ def gen_class(op, op_def): name=attr[0], other_op=lower_other_op ) ) - init = """ - {op}(const {other_op}& {lower_other_op}) : - {other_init} {{}} -""".format( - op=op, - other_op=other_op, - lower_other_op=lower_other_op, - other_init=',\n '.join(other_init) + init = dedent( + """ + {op}(const {other_op}& {lower_other_op}) : + {other_init} {{}} + """.format( + op=op, + other_op=other_op, + lower_other_op=lower_other_op, + other_init=",\n ".join(other_init), + ) ) extra_init += init - return """class {op} : public 
NeuralNetOperator {{ - public: - {op}({attribute_args}) : - {attribute_init} {{}} - {extra_init} - ~{op}() {{}} - - NOMNIGRAPH_DEFINE_NN_RTTI({op}); -{getters}{setters} - private: - {attribute_declarations} -}}; - -""".format( - op=op, - extra_init=extra_init, - getters=''.join(attribute_getters), - setters=''.join(attribute_setters), - attribute_args=',\n '.join(attribute_args), - attribute_init=',\n '.join(attribute_init), - attribute_declarations='\n '.join(attribute_declarations) + return dedent( + """ + class {op} : public NeuralNetOperator {{ + public: + {op}({attribute_args}) : + {attribute_init} {{}} + {extra_init} + ~{op}() {{}} + + NOMNIGRAPH_DEFINE_NN_RTTI({op}); + {getters}{setters} + private: + {attribute_declarations} + }}; + + """.format( + op=op, + extra_init=extra_init, + getters="".join(attribute_getters), + setters="".join(attribute_setters), + attribute_args=",\n".join(attribute_args), + attribute_init=",\n".join(attribute_init), + attribute_declarations="\n".join(attribute_declarations), + ) ) @@ -175,33 +190,51 @@ def gen_classes(ops, op_list): def gen_enum(op_list): - return ',\n'.join([op for op in op_list]) + '\n' + return ",\n".join([op for op in op_list]) + "\n" def gen_names(op_list): f = "" for op in op_list: - f += """case NNKind::{name}: - return \"{name}\"; -""".format(name=op) + f += dedent( + """ + case NNKind::{name}: + return \"{name}\"; + """.format( + name=op + ) + ) return f if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Generate op files.') - parser.add_argument('--install_dir', help='installation directory') - parser.add_argument('--source_def', help='ops.def') + parser = argparse.ArgumentParser(description="Generate op files.") + parser.add_argument("--install_dir", help="installation directory") + parser.add_argument("--source_def", help="ops.def", action="append") args = parser.parse_args() install_dir = args.install_dir + sources = args.source_def - with open(args.source_def, 'rb') as f: - lines = f.readlines() - lines = [l.strip().decode("utf-8") for l in lines] + lines = [] + for source in sources: + with open(source, "rb") as f: + lines_tmp = f.readlines() + lines += [l.strip().decode("utf-8") for l in lines_tmp] ops, op_list = parse_lines(lines) - with open(install_dir + '/OpClasses.h', 'wb') as f: + with open(install_dir + "/OpClasses.h", "wb") as f: f.write(gen_classes(ops, op_list).encode("utf-8")) - with open(install_dir + '/OpNames.h', 'wb') as f: + with open(install_dir + "/OpNames.h", "wb") as f: f.write(gen_names(op_list).encode("utf-8")) - with open(install_dir + '/OpEnum.h', 'wb') as f: + with open(install_dir + "/OpEnum.h", "wb") as f: f.write(gen_enum(op_list).encode("utf-8")) + + try: + cmd = ["clang-format", "-i", install_dir + "/OpClasses.h"] + call(cmd) + cmd = ["clang-format", "-i", install_dir + "/OpNames.h"] + call(cmd) + cmd = ["clang-format", "-i", install_dir + "/OpEnum.h"] + call(cmd) + except Exception: + pass diff --git a/caffe2/core/nomnigraph/ops.def b/caffe2/core/nomnigraph/ops.def index 53dd951c8fc1c2..6183e3c25726a3 100644 --- a/caffe2/core/nomnigraph/ops.def +++ b/caffe2/core/nomnigraph/ops.def @@ -69,30 +69,3 @@ CopyFromOpenCL NCHW2NHWC NHWC2NCHW -Int8Quantize -Int8Dequantize -Int8AveragePool : AveragePool -Int8Conv : Conv -Int8ConvTranspose : ConvTranspose -Int8FC : FC -Int8MaxPool : MaxPool -Int8Relu : Relu -Int8GivenTensorFill : GivenTensorFill -Int8Concat : Concat -Int8Softmax : Softmax -Int8ChannelShuffle : ChannelShuffle -Int8Sum : Sum -Int8Add : Add -Int8Reshape : Reshape 
-Int8Flatten : Flatten -Int8ConvRelu : ConvRelu -Int8SumRelu : SumRelu -Int8AveragePoolRelu : AveragePoolRelu -Int8MaxPoolRelu : MaxPoolRelu - -BatchMatMul -- TransA : bool : false -- TransB : bool : true -- Broadcast: bool : false -BatchGather -ConcatBatchMatMulBatchGatherOp diff --git a/caffe2/core/nomnigraph/tests/binary_match_test.cc b/caffe2/core/nomnigraph/tests/binary_match_test.cc index 4834cea30f3e23..ca3fd11b3a9126 100644 --- a/caffe2/core/nomnigraph/tests/binary_match_test.cc +++ b/caffe2/core/nomnigraph/tests/binary_match_test.cc @@ -19,7 +19,7 @@ TEST(BinaryMatch, AllMatch) { auto matches = nom::algorithm::binaryMatch( &graph, [](decltype(graph)::NodeRef n) { return true; }); EXPECT_EQ(matches.size(), 1); - EXPECT_EQ(matches.front().Nodes.size(), graph.getMutableNodes().size()); + EXPECT_EQ(matches.front().getNodesCount(), graph.getMutableNodes().size()); } TEST(BinaryMatch, EmptyGraph) { @@ -58,9 +58,9 @@ TEST(BinaryMatch, Basic) { EXPECT_EQ(matches.size(), 1); auto match = matches.front(); - EXPECT_EQ(match.Nodes.size(), 4); + EXPECT_EQ(match.getNodesCount(), 4); std::set exp{"2", "3", "4", "6"}; - for (auto n : match.Nodes) { + for (auto n : match.getNodes()) { EXPECT_EQ(exp.count(n->data()), 1); exp.erase(n->data()); } @@ -104,16 +104,16 @@ TEST(BinaryMatch, RemovedMiddleNode) { auto match1 = matches.front(); auto match2 = matches.back(); - EXPECT_EQ(match1.Nodes.size(), 2); - EXPECT_EQ(match2.Nodes.size(), 1); + EXPECT_EQ(match1.getNodesCount(), 2); + EXPECT_EQ(match2.getNodesCount(), 1); std::set exp1{"2", "4"}; std::set exp2{"6"}; - for (auto n : match1.Nodes) { + for (auto n : match1.getNodes()) { EXPECT_EQ(exp1.count(n->data()), 1); exp1.erase(n->data()); } - for (auto n : match2.Nodes) { + for (auto n : match2.getNodes()) { EXPECT_EQ(exp2.count(n->data()), 1); exp2.erase(n->data()); } diff --git a/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc b/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc new file mode 100644 index 00000000000000..ddd8a15fcdc2bc --- /dev/null +++ b/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc @@ -0,0 +1,404 @@ +#include + +#include "test_util.h" + +#include "nomnigraph/Transformations/SubgraphMatcher.h" + +#include + +namespace nom { + +namespace matcher { + +using NodeType = std::string; +using Criteria = std::string; + +// Node matches a criteria (string) if the data string is the same as the +// criteria. Special case: "*" will match any thing. +struct TestNodeMatch { + static bool isMatch( + const nom::Graph::NodeRef& node, + const Criteria& criteria) { + return criteria == "*" || criteria == node->data(); + } +}; + +using TestGraph = Graph; +using TestMatcher = SubgraphMatcher; + +Criteria any() { + return Criteria("*"); +} + +// Make it more concise to create matching criteria in dataflow graph. +// For example, operatorTree("opA", ...) will refer to a tree like this: +// ... -> opA -> opA_Output +SubtreeMatchCriteria operatorTree( + const Criteria& root, + const std::vector>& childrenCriteria = {}, + int count = 1) { + return tree(any(), {tree(root, childrenCriteria)}, count); +} + +std::map TestGraphNodePrinter( + TestGraph::NodeRef node) { + std::map labelMap; + labelMap["label"] = node->data(); + return labelMap; +}; + +// Attempts to create a realistic dataflow graph that shows a fuse procedure. 
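+// The criteria built below cover the chain opC -> opB -> opF -> opG (plus the
+// side input dataI); the ReplaceSubtreeRealistic test then collapses that
+// chain into a single "opFused" node.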
+struct DataFlowTestGraph { + const int numInputs = 4; + + TestGraph graph; + + TestGraph::NodeRef opB; + TestGraph::NodeRef opF; + TestGraph::NodeRef opC; + TestGraph::NodeRef opG; + TestGraph::NodeRef dataOut; + + // Realistic data flow test graph. + /* + + + +---------------+ + | | + | +---------+ | +---------+ + +---------------------+ | input_A | | | input_B | + | +---------+ | +---------+ + | | | | + | | | | + | v v v + +---------++---------+ +-------------------------+ +--------+ + | input_C || input_D | --> | opC | --> | dataC2 | + +---------++---------+ +-------------------------+ +--------+ + | + | + v + +---------+ + | dataC | -+ + +---------+ | + | | + | | + v | + +---------+ | + | opB | <+ + +---------+ + | + | + v + +---------+ + | dataB | + +---------+ + | + | + v + +---------+ + | opF | + +---------+ + | + | + v + +---------+ + | dataF | + +---------+ + | + | + v + +---------+ +---------+ + | dataI | --> | opG | + +---------+ +---------+ + | + | + v + +---------+ + | dataOut | + +---------+ + */ + DataFlowTestGraph() { + opC = graph.createNode("opC"); + + for (int i = 0; i < numInputs; i++) { + auto dataInput = graph.createNode("input"); + graph.createEdge(dataInput, opC); + } + + auto dataC = graph.createNode("dataC"); + auto dataC2 = graph.createNode("dataC2"); + graph.createEdge(opC, dataC); + graph.createEdge(opC, dataC2); + + opB = graph.createNode("opB"); + // There are 2 edges + graph.createEdge(dataC, opB); + graph.createEdge(dataC, opB); + + auto dataB = graph.createNode("dataB"); + graph.createEdge(opB, dataB); + + opF = graph.createNode("opF"); + graph.createEdge(dataB, opF); + + auto dataF = graph.createNode("dataF"); + graph.createEdge(opF, dataF); + + auto dataI = graph.createNode("dataI"); + + opG = graph.createNode("opG"); + graph.createEdge(dataF, opG); + graph.createEdge(dataI, opG); + + dataOut = graph.createNode("dataOut"); + graph.createEdge(opG, dataOut); + + // Use nom::converters::convertToDotString(&graph, TestGraphNodePrinter) + // to visualize the graph. + } +}; + +SubtreeMatchCriteria DataFlowTestGraphCriteria() { + // clang-format off + return tree( + Criteria("opG"),{ + operatorTree("opF", { + // Note: we currently don't enforce that these 2 opC nodes + // have to be the same. + operatorTree("opB", { + operatorTree("opC", { + treeStar(Criteria("input")) + }, 2), + }) + }), + tree(any()) // matches dataI + }); + // clang-format on +} + +TestGraph::NodeRef getInNode(TestGraph::NodeRef node, int index) { + return node->getInEdges()[index]->tail(); +} + +} // namespace matcher + +} // namespace nom + +using namespace nom::matcher; + +// Simple test cases for node matching criteria. +TEST(SubgraphMatcher, IsNodeMatch) { + TestGraph graph; + auto n1 = graph.createNode("Hello"); + auto n2 = graph.createNode("Le"); + graph.createEdge(n1, n2); + + EXPECT_TRUE(TestMatcher::isNodeMatch(n1, "Hello")); + EXPECT_FALSE(TestMatcher::isNodeMatch(n1, "G")); + EXPECT_TRUE(TestMatcher::isNodeMatch(n2, "Le")); + EXPECT_FALSE(TestMatcher::isNodeMatch(n2, "le")); +} + +// Test subtree matching with a simple tree graph. 
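+// NB: the extra boolean argument passed to isSubtreeMatch() in the tests below
+// flips the match direction: with `false`, a criteria's children are checked
+// against a node's out-edges, whereas the dataflow tests further down rely on
+// the default and match against a node's in-edges (its producers).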
+TEST(SubgraphMatcher, IsSubtreeMatch) { + TestGraph graph; + auto n1 = graph.createNode("1"); + auto n2 = graph.createNode("2"); + auto n3 = graph.createNode("3"); + auto n4 = graph.createNode("4"); + auto n5 = graph.createNode("5"); + auto n6 = graph.createNode("6"); + auto n7 = graph.createNode("7"); + + graph.createEdge(n1, n2); + graph.createEdge(n2, n3); + graph.createEdge(n2, n4); + graph.createEdge(n1, n5); + graph.createEdge(n5, n6); + graph.createEdge(n5, n7); + /* N1 + / \ + N2 N5 + / \ / \ + N3 N4 N6 N7 + */ + + auto subtree = tree(any(), {tree(any()), tree(any())}); + EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); + EXPECT_FALSE(TestMatcher::isSubtreeMatch(n4, subtree, false)); + + EXPECT_TRUE(TestMatcher::isSubtreeMatch(n2, subtree, false)); + EXPECT_TRUE(TestMatcher::isSubtreeMatch(n5, subtree, false)); + + subtree = tree(Criteria("5"), {tree(any()), tree(any())}); + EXPECT_FALSE(TestMatcher::isSubtreeMatch(n2, subtree, false)); + EXPECT_TRUE(TestMatcher::isSubtreeMatch(n5, subtree, false)); + + subtree = tree(any(), {tree(any()), tree(Criteria("4"))}); + EXPECT_TRUE(TestMatcher::isSubtreeMatch(n2, subtree, false)); + EXPECT_FALSE(TestMatcher::isSubtreeMatch(n5, subtree, false)); +} + +// Test subtree matching in which * (repeated) matching of children is allowed. +TEST(SubgraphMatcher, IsSubtreeMatchRepeated) { + TestGraph graph; + auto n1 = graph.createNode("1"); + auto n2 = graph.createNode("2"); + auto n3A = graph.createNode("3"); + auto n3B = graph.createNode("3"); + auto n4 = graph.createNode("4"); + auto n5A = graph.createNode("5"); + auto n5B = graph.createNode("5"); + auto n5C = graph.createNode("5"); + graph.createEdge(n1, n2); + graph.createEdge(n1, n3A); + graph.createEdge(n1, n3B); + graph.createEdge(n1, n4); + graph.createEdge(n1, n4); + graph.createEdge(n1, n5A); + graph.createEdge(n1, n5B); + graph.createEdge(n1, n5C); + + auto subtree = tree(any(), {tree(Criteria("2"))}); + EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); + + subtree = tree(any(), {treeStar(Criteria("2"))}); + EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); + + // clang-format off + subtree = tree(any(), { + tree(Criteria("2")), + tree(Criteria("3"), {}, 2), + tree(Criteria("4"), {}, 2), + tree(Criteria("5"), {}, 3) + }); + EXPECT_TRUE(TestMatcher::isSubtreeMatch(n1, subtree, false)); + + subtree = tree(any(), { + tree(Criteria("2")), + tree(Criteria("3"), {}, 2), + tree(Criteria("4"), {}, 2), + treeStar(Criteria("5")) + }); + EXPECT_TRUE(TestMatcher::isSubtreeMatch(n1, subtree, false)); + + subtree = tree(any(), { + tree(Criteria("2")), + treeStar(Criteria("3")), + tree(Criteria("4"), {}, 2), + treeStar(Criteria("5")) + }); + EXPECT_TRUE(TestMatcher::isSubtreeMatch(n1, subtree, false)); + + subtree = tree(any(), { + tree(Criteria("2")), + treeStar(Criteria("3")), + }); + // Fails because there are unmatched edges. + EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); + + subtree = tree(any(), { + tree(Criteria("2")), + tree(Criteria("3"), {}, 2), + tree(Criteria("4")), + tree(Criteria("5"), {}, 3) + }); + // Fails because the count is wrong; we have 2 edges to node N4 while + // the pattern expects only 1. 
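+ // (A tree() criteria without an explicit count expects exactly one matching
+ // edge; treeStar() relaxes the count and accepts a variable number.)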
+ EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); + // clang-format on +} + +TEST(SubgraphMatcher, IsSubtreeMatchRealistic) { + auto graph = DataFlowTestGraph(); + auto subtree = DataFlowTestGraphCriteria(); + + EXPECT_FALSE(TestMatcher::isSubtreeMatch(graph.opF, subtree)); + EXPECT_FALSE(TestMatcher::isSubtreeMatch(graph.opC, subtree)); + EXPECT_FALSE(TestMatcher::isSubtreeMatch(graph.opB, subtree)); + EXPECT_FALSE(TestMatcher::isSubtreeMatch(graph.dataOut, subtree)); + + EXPECT_TRUE(TestMatcher::isSubtreeMatch(graph.opG, subtree)); +} + +TEST(SubgraphMatcher, ReplaceSubtreeRealistic) { + auto graph = DataFlowTestGraph(); + auto subtree = DataFlowTestGraphCriteria(); + + TestMatcher::replaceSubtree( + graph.graph, subtree, [](TestGraph& g, TestGraph::NodeRef opG) { + auto opFused = g.createNode("opFused"); + + auto dataF = getInNode(opG, 0); + auto opF = getInNode(dataF, 0); + auto dataB = getInNode(opF, 0); + auto opB = getInNode(dataB, 0); + auto dataC = getInNode(opB, 0); + auto opC = getInNode(dataC, 0); + + g.deleteNode(dataF); + g.replaceNode(opG, opFused); + + auto outEdgesC = opC->getOutEdges(); + g.deleteNode(outEdgesC[0]->head()); + g.deleteNode(outEdgesC[1]->head()); + g.replaceNode(opC, opFused); + + g.deleteNode(opC); + g.deleteNode(opB); + g.deleteNode(dataB); + g.deleteNode(opF); + g.deleteNode(opG); + + return true; + }); + + // Now the nodes are: + // - NumInputs input nodes + // - dataI node + // - fused node + // - output node + auto nodes = graph.graph.getMutableNodes(); + + // Test that the graph is transformed as expected. + EXPECT_EQ(nodes.size(), graph.numInputs + 3); + TestGraph::NodeRef opFused; + TestGraph::NodeRef dataI; + TestGraph::NodeRef dataOut; + for (auto node : nodes) { + if (node->data() == "opFused") { + opFused = node; + } else if (node->data() == "dataOut") { + dataOut = node; + } else if (node->data() == "dataI") { + dataI = node; + } + } + + EXPECT_EQ(getInNode(dataOut, 0), opFused); + EXPECT_EQ(getInNode(opFused, 0), dataI); + for (int i = 1; i <= graph.numInputs; i++) { + EXPECT_EQ(getInNode(opFused, i)->data(), "input"); + } + + // Use nom::converters::convertToDotString(&graph.graph, TestGraphNodePrinter) + // to visualize. The transformed graph looks like This + /* + + +---------++---------+ + | input_A || input_D | + +---------++---------+ + | | + | | + v v ++---------+ +--------------------+ +---------+ +| input_B | --> | opFused | <-- | input_C | ++---------+ +--------------------+ +---------+ + | ^ + | | + v | + +---------++---------+ + | dataOut || dataI | + +---------++---------+ + */ +} diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 734d38d75e680d..9f88f192936fe4 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -323,6 +323,10 @@ class OperatorBase : public Observable { return !event_; } + virtual void SyncDevice() { + CAFFE_NOT_IMPLEMENTED; + } + // Checks whether stream is ready to execute new computation, // used in stream allocation optimization to skip stream that is currently // busy. 
Depends on context and operator's device, returns true by default @@ -577,6 +581,8 @@ class Operator : public OperatorBase { return &context_; } + void SyncDevice() final {} + virtual std::vector> InputFillers( const std::vector>& shapes) { CAFFE_ENFORCE(shapes.size() == Inputs().size()); diff --git a/caffe2/core/operator_gpu.cc b/caffe2/core/operator_gpu.cc new file mode 100644 index 00000000000000..03f227f7453524 --- /dev/null +++ b/caffe2/core/operator_gpu.cc @@ -0,0 +1,26 @@ +#include "caffe2/core/context_gpu.h" +#include "caffe2/core/operator.h" + +namespace caffe2 { + +template <> +void Operator::SyncDevice() { + auto* context = getContext(); + int device; + cudaGetDevice(&device); + + cudaEvent_t ev; + cudaSetDevice(context->cuda_gpu_id()); + cudaEventCreateWithFlags(&ev, cudaEventDisableTiming); + cudaEventRecord(ev, context->cuda_stream()); + cudaEventSynchronize(ev); + cudaEventDestroy(ev); + cudaSetDevice(device); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) { + CAFFE_THROW("Encountered CUDA error Stop: ", cudaGetErrorString(error)); + } +} + +} // namespace caffe2 diff --git a/caffe2/core/typeid.h b/caffe2/core/typeid.h index b4a01b57cc11e3..facea9fa64d2fa 100644 --- a/caffe2/core/typeid.h +++ b/caffe2/core/typeid.h @@ -14,8 +14,9 @@ #include +#include "ATen/core/Half.h" #include "caffe2/core/common.h" -#include "caffe2/utils/IdWrapper.h" +#include "ATen/core/IdWrapper.h" namespace caffe2 { class CaffeTypeId; @@ -32,16 +33,16 @@ class TypeMeta; * You need to register your types using CAFFE_KNOWN_TYPE(MyType) to be able to use CaffeTypeId with custom types. * This is for example used to store the dtype of tensors. */ -class CaffeTypeId final : public c10::guts::IdWrapper { +class CaffeTypeId final : public at::IdWrapper { public: static CaffeTypeId createTypeId(); friend std::ostream& ::operator<<(std::ostream& stream, CaffeTypeId typeId); friend bool operator<(CaffeTypeId lhs, CaffeTypeId rhs); - // TODO Can we get rid of uninitialized? + // This is 8, because 0 is uint8_t (due to ScalarType BC constraint) static constexpr CaffeTypeId uninitialized() { - return CaffeTypeId(0); + return CaffeTypeId(8); } private: @@ -57,7 +58,7 @@ inline bool operator<(CaffeTypeId lhs, CaffeTypeId rhs) { } -C10_DEFINE_HASH_FOR_IDWRAPPER(caffe2::CaffeTypeId) +AT_DEFINE_HASH_FOR_IDWRAPPER(caffe2::CaffeTypeId) inline std::ostream& operator<<(std::ostream& stream, caffe2::CaffeTypeId typeId) { return stream << typeId.underlyingId(); @@ -439,35 +440,41 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept { class Tensor; -// note: first preallocated id is 1, because 0 is used for uninitialized type -// ids. +// Note: we have preallocated the numbers 0-8 so they line up exactly +// with at::ScalarType's numbering. All other numbers do not matter. +// +// Notably, the "uninitialized" type id is 8, not 0, for hysterical raisins. 
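To make the intended correspondence concrete, here is a minimal, illustrative sketch of the invariant these ids are meant to satisfy, assuming at::ScalarType is visible here and keeps the numbering described in the comment above (this check is not part of the patch):

    // Illustrative only: the preallocated Caffe2 type ids mirror at::ScalarType,
    // e.g. float sits at id 6 and id 8 is reserved for the undefined/uninitialized type.
    static_assert(static_cast<int>(at::ScalarType::Float) == 6,
                  "float must keep id 6 to match CAFFE_DECLARE_KNOWN_TYPE(6, float)");
    static_assert(static_cast<int>(at::ScalarType::Undefined) == 8,
                  "id 8 is reserved for the uninitialized type id");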
+ struct _CaffeHighestPreallocatedTypeId final {}; -CAFFE_DECLARE_KNOWN_TYPE(1, Tensor); -CAFFE_DECLARE_KNOWN_TYPE(2, float); +CAFFE_DECLARE_KNOWN_TYPE(0, uint8_t); +CAFFE_DECLARE_KNOWN_TYPE(1, int8_t); +CAFFE_DECLARE_KNOWN_TYPE(2, int16_t); CAFFE_DECLARE_KNOWN_TYPE(3, int); -CAFFE_DECLARE_KNOWN_TYPE(4, std::string); -CAFFE_DECLARE_KNOWN_TYPE(5, bool); -CAFFE_DECLARE_KNOWN_TYPE(6, uint8_t); -CAFFE_DECLARE_KNOWN_TYPE(7, int8_t); -CAFFE_DECLARE_KNOWN_TYPE(8, uint16_t); -CAFFE_DECLARE_KNOWN_TYPE(9, int16_t); -CAFFE_DECLARE_KNOWN_TYPE(10, int64_t); -CAFFE_DECLARE_KNOWN_TYPE(11, double); -CAFFE_DECLARE_KNOWN_TYPE(12, char); -CAFFE_DECLARE_KNOWN_TYPE(13, std::unique_ptr); -CAFFE_DECLARE_KNOWN_TYPE(14, std::unique_ptr>); -CAFFE_DECLARE_KNOWN_TYPE(15, std::vector); -CAFFE_DECLARE_KNOWN_TYPE(16, std::vector); -CAFFE_DECLARE_KNOWN_TYPE(17, std::vector); -CAFFE_DECLARE_KNOWN_TYPE(18, bool*); -CAFFE_DECLARE_KNOWN_TYPE(19, char*); -CAFFE_DECLARE_KNOWN_TYPE(20, int*); +CAFFE_DECLARE_KNOWN_TYPE(4, int64_t); +CAFFE_DECLARE_KNOWN_TYPE(5, at::Half); +CAFFE_DECLARE_KNOWN_TYPE(6, float); +CAFFE_DECLARE_KNOWN_TYPE(7, double); +// 8 = undefined type id + +CAFFE_DECLARE_KNOWN_TYPE(9, Tensor); +CAFFE_DECLARE_KNOWN_TYPE(10, std::string); +CAFFE_DECLARE_KNOWN_TYPE(11, bool); +CAFFE_DECLARE_KNOWN_TYPE(12, uint16_t); +CAFFE_DECLARE_KNOWN_TYPE(13, char); +CAFFE_DECLARE_KNOWN_TYPE(14, std::unique_ptr); +CAFFE_DECLARE_KNOWN_TYPE(15, std::unique_ptr>); +CAFFE_DECLARE_KNOWN_TYPE(16, std::vector); +CAFFE_DECLARE_KNOWN_TYPE(17, std::vector); +CAFFE_DECLARE_KNOWN_TYPE(18, std::vector); +CAFFE_DECLARE_KNOWN_TYPE(19, bool*); +CAFFE_DECLARE_KNOWN_TYPE(20, char*); +CAFFE_DECLARE_KNOWN_TYPE(21, int*); #ifdef CAFFE2_UNIQUE_LONG_TYPEMETA -CAFFE_DECLARE_KNOWN_TYPE(21, long); -CAFFE_DECLARE_KNOWN_TYPE(22, std::vector); +CAFFE_DECLARE_KNOWN_TYPE(22, long); +CAFFE_DECLARE_KNOWN_TYPE(23, std::vector); #endif // CAFFE2_UNIQUE_LONG_TYPEMETA -CAFFE_DECLARE_KNOWN_TYPE(23, _CaffeHighestPreallocatedTypeId); +CAFFE_DECLARE_KNOWN_TYPE(24, _CaffeHighestPreallocatedTypeId); } diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h index 35c2008d4fdab0..c7215e0ed28b32 100644 --- a/caffe2/ideep/utils/ideep_context.h +++ b/caffe2/ideep/utils/ideep_context.h @@ -21,7 +21,7 @@ class IDEEPContext final : public BaseContext { CAFFE_ENFORCE_EQ(option.device_type(), IDEEP); } - ~IDEEPContext() noexcept {} + ~IDEEPContext() noexcept override {} BaseStaticContext* GetStaticContext() const override { return GetIDEEPStaticContext(); diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index 45f55ab2407a2e..755e1b5a57b8a9 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -489,7 +489,7 @@ bool RunOnDevice() override { "noise_size", 491 /* prime to avoid artifacts */); // Treaded as half4 in the kernel, so need half4 here. 
noiseSize = divRoundUp(noiseSize, 4) * 4; - if (!noiseBlob->IsType() || + if (!noiseBlob->IsType(CPU) || noiseBlob->Get().size() != noiseSize) { VLOG(2) << "Initializing stylizer with noise: " << noiseSize; caffe2::Timer rt; diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm index 9f032e6fe299d0..bcf588d8a384f0 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm @@ -94,7 +94,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutable(); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); t->Resize(BS, C, H, W); CPUContext ctx; math::RandGaussian( @@ -152,7 +152,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutable(); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); switch (ndim) { case 1: t->Resize(5); @@ -210,7 +210,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNNormalizePlanarYUV Test: "; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -218,14 +218,14 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutable(); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(1, channels); CPUContext ctx; math::RandGaussian( t->size(), 0, 1, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("stddev")->GetMutable(); + auto* t = ws.CreateBlob("stddev")->GetMutableTensor(CPU); t->Resize(1, channels); CPUContext ctx; math::RandUniform( @@ -290,7 +290,7 @@ void testMPSCNN() { for (const auto dim : {10, 40}) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, channels, dim, dim); CPUContext ctx; // Too noisy. @@ -299,7 +299,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -310,7 +310,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -321,7 +321,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("pw")->GetMutable(); + auto* t = ws.CreateBlob("pw")->GetMutableTensor(CPU); t->Resize(prelu == PreluTy::SHARED ? 1 : channels); CPUContext ctx; // Too noisy. @@ -409,7 +409,7 @@ void testMPSCNN() { Workspace ws; const auto channels = array ? 12 : 3; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -417,7 +417,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(shared ? 
channels : 1); CPUContext ctx; math::RandGaussian( @@ -480,7 +480,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSpatialBN Test: " << channels; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -488,7 +488,7 @@ void testMPSCNN() { } for (const std::string name : {"scale", "bias", "mean", "var"}) { - auto* t = ws.CreateBlob(name)->GetMutable(); + auto* t = ws.CreateBlob(name)->GetMutableTensor(CPU); t->Resize(channels); CPUContext ctx; // High mean to avoid var division by zero. @@ -575,7 +575,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNFC Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, CIn, H, W); CPUContext ctx; math::RandGaussian( @@ -583,7 +583,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(COut, CIn * H * W); CPUContext ctx; math::RandGaussian( @@ -591,7 +591,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(COut); CPUContext ctx; math::RandGaussian( @@ -683,7 +683,7 @@ void testMPSCNN() { Workspace ws; { auto* t = - ws.CreateBlob("X_cpu")->GetMutable(); + ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, 8, 8, 13); CPUContext ctx; math::RandGaussian( @@ -784,7 +784,7 @@ void testMPSCNN() { std::vector>{{1, 3, 50, 80}, {1, 12, 50, 80}}) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(dims); CPUContext ctx; math::RandGaussian( @@ -860,7 +860,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNPreprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 8, 13, 4); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -869,7 +869,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutable(); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -940,7 +940,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 3, 8, 24); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -949,7 +949,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutable(); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -999,7 +999,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 3, 1280, 720); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1008,7 +1008,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutable(); + auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 30; @@ -1072,8 +1072,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConv Test"; Workspace ws; { - auto* t = - ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ 
-1081,7 +1080,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(8, 12, kernel_h, kernel_w); CPUContext ctx; math::RandGaussian( @@ -1093,7 +1092,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1189,7 +1188,7 @@ void testMPSCNN() { Workspace ws; int output_channels = input_channels * channel_multiplier; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, input_channels, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1197,7 +1196,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(output_channels, 1, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1205,7 +1204,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(output_channels); CPUContext ctx; math::RandGaussian( @@ -1276,7 +1275,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConvRelu Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1284,7 +1283,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1292,7 +1291,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1386,7 +1385,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1394,7 +1393,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1402,7 +1401,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1494,7 +1493,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, C, 12, 16); CPUContext ctx; math::RandGaussian( @@ -1502,7 +1501,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(M, C, K, K); CPUContext ctx; math::RandGaussian( @@ -1510,7 +1509,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(M); CPUContext ctx; math::RandGaussian( @@ -1608,7 +1607,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConv Test - group"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, C, 12, 16); CPUContext ctx; math::RandGaussian( @@ -1616,7 +1615,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutable(); + auto* t = 
ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize(M, C / group, K, K); CPUContext ctx; math::RandGaussian( @@ -1624,7 +1623,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutable(); + auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(M); CPUContext ctx; math::RandGaussian( @@ -1727,7 +1726,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNMul Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1735,7 +1734,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); t->Resize(72); CPUContext ctx; math::RandGaussian( @@ -1792,7 +1791,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSub Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1800,7 +1799,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); t->Resize(72); CPUContext ctx; math::RandGaussian( @@ -1857,7 +1856,7 @@ void testMPSCNN() { LOG(INFO) << "MPSAdd Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1865,7 +1864,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1922,7 +1921,7 @@ void testMPSCNN() { LOG(INFO) << "MPSAdd Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1930,7 +1929,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -2012,7 +2011,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNNeuron Test: " << n; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 4, 12, 12); CPUContext ctx; math::RandGaussian( @@ -2066,7 +2065,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDropout Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -2137,7 +2136,7 @@ void testMPSCNN() { << " - scale: " << scale; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, channels, 40, 40); CPUContext ctx; math::RandGaussian( @@ -2145,7 +2144,7 @@ void testMPSCNN() { } { // Use the batch-first encoding (n, [bbox]) - auto* t = ws.CreateBlob("R")->GetMutable(); + auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU); t->Resize(6, 5); for (auto i = 0; i < t->dim32(0); ++i) { t->mutable_data()[5 * i + 0] = 0; // batch @@ -2251,14 +2250,14 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNRoIWarp Test 2"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(1, 8, 40, 40); CPUContext ctx; 
math::RandGaussian( t->size(), 4, 2, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("R")->GetMutable(); + auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU); t->Resize(6, 4); for (auto i = 0; i < t->dim32(0); ++i) { t->mutable_data()[4 * i + 0] = (i % 4 + 1) * 1.0 / scale; @@ -2363,7 +2362,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNResizeNearestOp Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(N, C, 37, 89); CPUContext ctx; math::RandGaussian( @@ -2498,7 +2497,7 @@ void testMPSCNN() { vector im_info{60, 80, 0.166667}; vector anchors{-38, -16, 53, 31, -120, -120, 135, 135}; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(num_images, A, H, W); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = scores[i]; @@ -2506,7 +2505,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("bbox_delta_cpu")->GetMutable(); + auto* t = ws.CreateBlob("bbox_delta_cpu")->GetMutableTensor(CPU); t->Resize(num_images, 4 * A, H, W); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = bbx[i]; @@ -2514,7 +2513,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("im_info")->GetMutable(); + auto* t = ws.CreateBlob("im_info")->GetMutableTensor(CPU); t->Resize(num_images, 3); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = im_info[i]; @@ -2522,7 +2521,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("anchors")->GetMutable(); + auto* t = ws.CreateBlob("anchors")->GetMutableTensor(CPU); t->Resize(A, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = anchors[i]; @@ -2588,7 +2587,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSoftmax Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); // Only works for spatial dimension of (1, 1) - weird. t->Resize(batchSize, 12, 1, 1); CPUContext ctx; @@ -2662,8 +2661,8 @@ void testMPSCNN() { LOG(INFO) << "MPSConvTranspose Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu") - ->GetMutable(); + auto* t = + ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, inputChannels, 8, 12); CPUContext ctx; math::RandGaussian( @@ -2676,7 +2675,7 @@ void testMPSCNN() { { auto* t = - ws.CreateBlob("W")->GetMutable(); + ws.CreateBlob("W")->GetMutableTensor(CPU); t->Resize( inputChannels, outputChannels, @@ -2693,7 +2692,7 @@ void testMPSCNN() { { auto* t = - ws.CreateBlob("b")->GetMutable(); + ws.CreateBlob("b")->GetMutableTensor(CPU); t->Resize(outputChannels); CPUContext ctx; math::RandGaussian( @@ -2810,7 +2809,7 @@ void testMPSCNN() { << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutable(); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); t->Resize(batchSize, array ? 
(i + 1) * 4 : 4, 10, 10); CPUContext ctx; math::RandGaussian( @@ -2892,7 +2891,7 @@ void testMPSCNN() { } Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutable(); + auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); t->Resize(batchSize, inputChannels, 53, 47); CPUContext ctx; math::RandGaussian( @@ -2965,7 +2964,7 @@ void testMPSCNN() { << numInputs << ", " << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutable(); + auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); t->Resize(batchSize, channelCount, 9, 17); CPUContext ctx; math::RandGaussian( @@ -3338,7 +3337,7 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { cws.RunNetOnce(initNet); { auto* t = - cws.CreateBlob(predictNet.external_input(0))->GetMutable(); + cws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3350,7 +3349,7 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { mws.RunNetOnce(initNet); { auto* t = - mws.CreateBlob(predictNet.external_input(0))->GetMutable(); + mws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3398,16 +3397,16 @@ void verifyRewrite( dumpDef(predictNet); dumpDef(metalPredictNet); -#define RUN_NET(ws, predictNet) \ - ws.RunNetOnce(initNet); \ - { \ - auto* t = \ - ws.CreateBlob(predictNet.external_input(0))->GetMutable(); \ - t->Resize(inputDims); \ - CPUContext ctx; \ - math::RandGaussian( \ - t->size(), 0, 1, t->mutable_data(), &ctx); \ - } \ +#define RUN_NET(ws, predictNet) \ + ws.RunNetOnce(initNet); \ + { \ + auto* t = \ + ws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); \ + t->Resize(inputDims); \ + CPUContext ctx; \ + math::RandGaussian( \ + t->size(), 0, 1, t->mutable_data(), &ctx); \ + } \ ws.RunNetOnce(predictNet); // initialize diff --git a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h index 70b9ac05747511..2896bc26ac08d4 100644 --- a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h +++ b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h @@ -41,7 +41,7 @@ void RowsWhereRoILevelEquals(Eigen::Ref rois, // distribute those proposals to their appropriate FPN levels for Faster RCNN. // An anchor at one FPN level may predict an RoI that will map to another // level, hence the need to redistribute the proposals. -// Reference: detectron/lib/ops/collect_and_distribute_fpn_rpn_proposals.py +// Reference: facebookresearch/Detectron/detectron/ops/collect_and_distribute_fpn_rpn_proposals.py template class CollectAndDistributeFpnRpnProposalsOp final : public Operator { public: diff --git a/caffe2/operators/conv_op_cudnn.cc b/caffe2/operators/conv_op_cudnn.cc index ddb0f8f89c144b..2f11645f21c5cc 100644 --- a/caffe2/operators/conv_op_cudnn.cc +++ b/caffe2/operators/conv_op_cudnn.cc @@ -602,12 +602,12 @@ bool CudnnConvOp::DoRunWithType() { kernel_w())); } else { vector dims(filter.dims().begin(), filter.dims().end()); - dims[0] /= group_; #if !CUDNN_VERSION_MIN(7, 0, 0) + // We only need to divide dims by group_ when CUDNN version < 7.0 + // see CUDA group convolution doc: https://fburl.com/dgj6dvpd order_ == StorageOrder::NCHW ? 
dims[1] /= group_ : dims[filter.ndim() - 1] /= group_; #endif - dims[filter.ndim() - 1] /= group_; CUDNN_ENFORCE(cudnnSetFilterNdDescriptor( filter_desc_, cudnnTypeWrapper::type, @@ -959,10 +959,12 @@ bool CudnnConvGradientOp::DoRunWithType() { } else { vector dims(filter.dims().begin(), filter.dims().end()); #if !CUDNN_VERSION_MIN(7, 0, 0) - dims[0] /= group_; -#endif + // We only need to divide dims by group_ when CUDNN version < 7.0 + // see CUDA group convolution doc: https://fburl.com/dgj6dvpd order_ == StorageOrder::NCHW ? dims[1] /= group_ : dims[filter.ndim() - 1] /= group_; +#endif + CUDNN_ENFORCE(cudnnSetFilterNdDescriptor( filter_desc_, cudnnTypeWrapper::type, diff --git a/caffe2/operators/generate_proposals_op.h b/caffe2/operators/generate_proposals_op.h index 81f7d9ac43123f..faf4936495244f 100644 --- a/caffe2/operators/generate_proposals_op.h +++ b/caffe2/operators/generate_proposals_op.h @@ -59,7 +59,7 @@ ERMatXf ComputeAllAnchors( // regression result 'deltas' as well as predefined bounding box shapes // 'anchors'. Greedy non-maximum suppression is applied to generate the // final bounding boxes. -// Reference: detectron/lib/ops/generate_proposals.py +// Reference: facebookresearch/Detectron/detectron/ops/generate_proposals.py template class GenerateProposalsOp final : public Operator { public: diff --git a/caffe2/operators/generate_proposals_op_util_boxes.h b/caffe2/operators/generate_proposals_op_util_boxes.h index 0c4c345d382cb1..333514102b7d4b 100644 --- a/caffe2/operators/generate_proposals_op_util_boxes.h +++ b/caffe2/operators/generate_proposals_op_util_boxes.h @@ -5,7 +5,7 @@ #include "caffe2/utils/math.h" // Bounding box utils for generate_proposals_op -// Reference: detectron/lib/utils/boxes.py +// Reference: facebookresearch/Detectron/detectron/utils/boxes.py namespace caffe2 { namespace utils { diff --git a/caffe2/operators/generate_proposals_op_util_nms.h b/caffe2/operators/generate_proposals_op_util_nms.h index 5d6f87d4d30563..7b38cd6a1420d6 100644 --- a/caffe2/operators/generate_proposals_op_util_nms.h +++ b/caffe2/operators/generate_proposals_op_util_nms.h @@ -19,7 +19,7 @@ namespace utils { // Reject a bounding box if its region has an intersection-overunion (IoU) // overlap with a higher scoring selected bounding box larger than a // threshold. -// Reference: detectron/lib/utils/cython_nms.pyx +// Reference: facebookresearch/Detectron/detectron/utils/cython_nms.pyx // proposals: pixel coordinates of proposed bounding boxes, // size: (M, 4), format: [x1; y1; x2; y2] // scores: scores for each bounding box, size: (M, 1) @@ -78,7 +78,7 @@ std::vector nms_cpu_upright( /** * Soft-NMS implementation as outlined in https://arxiv.org/abs/1704.04503. - * Reference: detectron/lib/utils/cython_nms.pyx + * Reference: facebookresearch/Detectron/detectron/utils/cython_nms.pyx * out_scores: Output updated scores after applying Soft-NMS * proposals: pixel coordinates of proposed bounding boxes, * size: (M, 4), format: [x1; y1; x2; y2] @@ -426,7 +426,7 @@ std::vector nms_cpu( // Reject a bounding box if its region has an intersection-overunion (IoU) // overlap with a higher scoring selected bounding box larger than a // threshold. 
-// Reference: detectron/lib/utils/cython_nms.pyx +// Reference: facebookresearch/Detectron/detectron/lib/utils/cython_nms.pyx // proposals: pixel coordinates of proposed bounding boxes, // size: (M, 4), format: [x1; y1; x2; y2] // size: (M, 5), format: [ctr_x; ctr_y; w; h; angle (degrees)] for RRPN diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index b4866618b4e607..37d675eba83a49 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -146,9 +146,6 @@ REGISTER_CONVERTER(SpatialBN, BatchNormalizationConverter); TRIVIAL_CONVERTER(Flatten); REGISTER_CONVERTER(Flatten, FlattenConverter); -TRIVIAL_CONVERTER(BatchGather); -REGISTER_CONVERTER(BatchGather, BatchGatherConverter); - class AveragePoolConverter : public Converter { std::unique_ptr convertToNeuralNetOperator( const OperatorDef& op) override { @@ -205,37 +202,6 @@ class ConcatConverter : public Converter { }; REGISTER_CONVERTER(Concat, ConcatConverter); -class BatchMatMulConverter : public Converter { - std::unique_ptr convertToNeuralNetOperator( - const OperatorDef& op) override { - std::unique_ptr nnOp = - util::make_unique(); - auto argMap = getArgumentsFromOperator(op); - - auto c = dyn_cast(nnOp.get()); - if (argMap.count("trans_a")) { - CAFFE_ENFORCE(argMap["trans_a"].has_i(), "Invalid axis argument"); - int trans_a = static_cast(argMap["trans_a"].i()); - c->setTransA(!!trans_a); - } - if (argMap.count("trans_b")) { - CAFFE_ENFORCE(argMap["trans_b"].has_i(), "Invalid add_axis argument"); - int trans_b = static_cast(argMap["trans_b"].i()); - c->setTransB(!!trans_b); - } - if (argMap.count("broadcast")) { - CAFFE_ENFORCE(argMap["broadcast"].has_i(), "Invalid add_axis argument"); - int broadcast = static_cast(argMap["broadcast"].i()); - c->setBroadcast(!!broadcast); - } - return nnOp; - } - // Does not override default converter to OperatorDef - - virtual ~BatchMatMulConverter() {} -}; -REGISTER_CONVERTER(BatchMatMul, BatchMatMulConverter); - } // namespace std::unique_ptr convertToNeuralNetOperator( @@ -270,145 +236,6 @@ std::unique_ptr convertToNeuralNetOperator( return nnOp; } -void handleWhileOp( - repr::NNGraph& dfg, - repr::NNCFGraph& cfg, - repr::NNGraph::NodeRef& opNode, - repr::NNCFGraph::NodeRef& bbNode, - OperatorDef& op, - std::unordered_map& blobMap -) { - opNode->resetData(util::make_unique()); - auto argMap = Converter::getArgumentsFromOperator(op); - std::string bodyNetSerialized = argMap["body"].s(); - auto bodyNet = caffe2::NetDef(); - bodyNet.ParseFromString(bodyNetSerialized); - - std::unordered_map bodyBlobMap; - auto bodyNN = convertToNNModule(bodyNet, &bodyBlobMap); - repr::NNGraph bodyGraph = std::move(bodyNN.dataFlow); - repr::NNCFGraph bodyCFGraph = std::move(bodyNN.controlFlow); - - auto rev_sorted = algorithm::tarjans(&bodyGraph); - - for (auto& k : bodyBlobMap) { - auto name = k.first; - if (blobMap.count(name)) { - auto oldNode = blobMap[name]; - printf("Exit tensor %s is in the parent scope, inserting Phi node...\n", k.first.c_str()); - auto phiNode = dfg.createNode(util::make_unique()); // NN variant of a Phi node - // Clone the operator. 
- auto tensor = dyn_cast(blobMap[name]->data().get()); - auto* clonedTensor = tensor->clone(); - auto phiOut = dfg.createNode(std::unique_ptr(clonedTensor)); - dfg.createEdge(phiNode, phiOut); - dfg.createEdge(oldNode, phiNode); - dfg.createEdge(bodyBlobMap[name], phiNode); - blobMap[name] = phiOut; - for (auto& inEdge : opNode->getInEdges()) { - if (inEdge->tail() == oldNode) { - dfg.deleteEdge(inEdge); - dfg.createEdge(phiOut, opNode); - } - } - } - } - - // Dependencies simply have no producers - std::unordered_map inNodeMap; - for (auto& n : bodyGraph.getMutableNodes()) { - if (!isa(n->data())) { continue; } - if (n->getInEdges().size() == 0) { - auto name = dyn_cast(n->data().get())->getName(); - // TODO(bwasti): this may be needed, depending on constraints - //assert(blobMap.count(name) != 0 && "Loop body takes undefined dependency."); - if (blobMap.count(name)) { - inNodeMap[n] = blobMap[name]; - } - } - } - - CAFFE_ENFORCE(rev_sorted.front().getNodes().size() == 1, - "More than one exit node."); - CAFFE_ENFORCE(rev_sorted.back().getNodes().size() == 1, - "More than one entry node."); - - auto exit_tensor = *(rev_sorted.front().getNodes().begin()); - CAFFE_ENFORCE(isa(exit_tensor->data()), - "Exit node is not a tensor."); - - auto bodyNodes = bodyGraph.getMutableNodes(); - auto bodyEdges = bodyGraph.getMutableEdges(); - - for (auto node : bodyNodes) { - bodyGraph.importNode(node, dfg); - } - - for (auto edge : bodyEdges) { - bodyGraph.importEdge(edge, dfg); - } - - // Merge all dependencies - for (auto node : dfg.getMutableNodes()) { - if (inNodeMap.count(node)) { - dfg.replaceNode(node, inNodeMap[node]); - dfg.deleteNode(node); - } - } - - for (const auto& inEdge : opNode->getInEdges()) { - auto* inputData = dyn_cast(inEdge->tail()->data().get()); - auto* exitData = dyn_cast(exit_tensor->data().get()); - if (inputData->getName() == exitData->getName()) { - dfg.replaceNode(exit_tensor, inEdge->tail()); - dfg.deleteNode(exit_tensor); - } - } - - // CFG Handling - auto bodyCFNodes = bodyCFGraph.getMutableNodes(); - auto bodyCFEdges = bodyCFGraph.getMutableEdges(); - - // Create a while loop CFG node. - auto whileBasicBlock = util::make_unique>(); - for (auto& inEdge : opNode->getInEdges()) { - auto node = inEdge->tail(); - for (auto& parentInEdge : node->getInEdges()) { - auto parentNode = parentInEdge->tail(); - if (isa(parentNode->data().get())) { - whileBasicBlock->pushInstructionNode(parentNode); - } - } - } - whileBasicBlock->pushInstructionNode(opNode); - - auto whileCFNode = cfg.createNode(std::move(whileBasicBlock)); - cfg.createEdge(bbNode, whileCFNode, 0); - - // The true path executes the body of the loop, so we - // take that BB and point to it. - for (auto cfNode : bodyCFNodes) { - bodyCFGraph.importNode(cfNode, cfg); - // If the CFG node has no children, we loop back to the top of the - // while loop. - if (cfNode->getOutEdges().size() == 0) { - cfg.createEdge(cfNode, whileCFNode, 0); - } - // TODO check for a single entry point - if (cfNode->getInEdges().size() == 0) { - cfg.createEdge(whileCFNode, cfNode, 1); - } - } - for (auto cfEdge : bodyCFEdges) { - bodyCFGraph.importEdge(cfEdge, cfg); - } - - // Now create the false case. - bbNode = - cfg.createNode(util::make_unique>()); - cfg.createEdge(whileCFNode, bbNode, -1); -} - /// \brief Ingest a caffe2 protobuf model and output an NNModule. 
/// \param net The caffe2 protobuf NetDef @@ -455,13 +282,9 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_mapresetData(convertToNeuralNetOperator(op)); - auto currentBasicBlock = bbNode->mutableData()->get(); - currentBasicBlock->pushInstructionNode(opNode); - } + opNode->resetData(convertToNeuralNetOperator(op)); + auto currentBasicBlock = bbNode->mutableData()->get(); + currentBasicBlock->pushInstructionNode(opNode); } repr::NNModule module; diff --git a/caffe2/opt/converter_nomigraph_test.cc b/caffe2/opt/converter_nomigraph_test.cc index 69f51df49cbf74..0bab53f738d7c2 100644 --- a/caffe2/opt/converter_nomigraph_test.cc +++ b/caffe2/opt/converter_nomigraph_test.cc @@ -48,65 +48,3 @@ TEST(Converter, UnknownType) { auto new_netdef = caffe2::convertToCaffe2Proto(nn); } -/* Temporarily disabled While conversion tests -TEST(Converter, While) { - caffe2::NetDef net; - - caffe2::OperatorDef *def = net.add_op(); - def->set_type("While"); - def->add_input("X"); - - caffe2::NetDef body_net; - { - caffe2::OperatorDef *rdef = body_net.add_op(); - rdef->set_type("Relu"); - rdef->add_input("X"); - rdef->add_output("X"); - } - std::string body_net_serialized; - assert(body_net.SerializeToString(&body_net_serialized)); - ADD_ARG(def, "body", s, body_net_serialized); - - auto nn = caffe2::convertToNNModule(net); -} - -TEST(Converter, ComplexWhile) { - caffe2::NetDef net; - - { - caffe2::OperatorDef *rdef = net.add_op(); - rdef->set_type("Relu"); - rdef->add_input("X"); - rdef->add_output("X"); - } - - caffe2::OperatorDef *def = net.add_op(); - def->set_type("While"); - def->add_input("X"); - - caffe2::NetDef body_net; - { - caffe2::OperatorDef *rdef = body_net.add_op(); - rdef->set_type("Instr1"); - rdef->add_input("X"); - rdef->add_output("X"); - } - { - caffe2::OperatorDef *rdef = body_net.add_op(); - rdef->set_type("Instr2"); - rdef->add_input("X"); - rdef->add_output("X"); - } - { - caffe2::OperatorDef *rdef = body_net.add_op(); - rdef->set_type("Instr3"); - rdef->add_input("X"); - rdef->add_output("X"); - } - std::string body_net_serialized; - assert(body_net.SerializeToString(&body_net_serialized)); - ADD_ARG(def, "body", s, body_net_serialized); - - auto nn = caffe2::convertToNNModule(net); -} -*/ diff --git a/caffe2/opt/device.cc b/caffe2/opt/device.cc index 9abca6d67e08b3..0cfdd6c1dc91a3 100644 --- a/caffe2/opt/device.cc +++ b/caffe2/opt/device.cc @@ -9,15 +9,14 @@ std::vector getInputEdges( const NNGraph::SubgraphType& sg, const NNGraph& g) { std::vector inputTensorEdges; - for (const auto& node : sg.Nodes) { + for (const auto& node : sg.getNodes()) { NOM_REQUIRE_OR_CONT(nn::is(node)); NOM_REQUIRE_OR_CONT(nn::hasInputs(node)); // Check if tensor's parents are in the sg for (const auto& input : nn::getInputs(node)) { NOM_REQUIRE_OR_CONT( - !nn::hasProducer(input) || - sg.Nodes.count(nn::getProducer(input)) == 0); + !nn::hasProducer(input) || !sg.hasNode(nn::getProducer(input))); inputTensorEdges.emplace_back(g.getEdge(input, node)); } } @@ -28,13 +27,13 @@ std::vector getOutputEdges( const NNGraph::SubgraphType& sg, const NNGraph& g) { std::vector outputTensorEdges; - for (const auto& node : sg.Nodes) { + for (const auto& node : sg.getNodes()) { NOM_REQUIRE_OR_CONT(nn::is(node)); for (const auto& output : nn::getOutputs(node)) { auto consumers = nn::getConsumers(output); for (const auto& consumer : consumers) { - NOM_REQUIRE_OR_CONT(sg.Nodes.count(consumer) == 0); + NOM_REQUIRE_OR_CONT(!sg.hasNode(consumer)); outputTensorEdges.emplace_back(g.getEdge(node, output)); } 
NOM_REQUIRE_OR_CONT(consumers.size() == 0); diff --git a/caffe2/opt/fusion.cc b/caffe2/opt/fusion.cc index 8a1b736399562a..f5ea0f678ed515 100644 --- a/caffe2/opt/fusion.cc +++ b/caffe2/opt/fusion.cc @@ -1,5 +1,6 @@ -#include "caffe2/opt/converter.h" #include "caffe2/opt/fusion.h" +#include "caffe2/core/logging.h" +#include "caffe2/opt/converter.h" #include "caffe2/opt/passes.h" namespace caffe2 { @@ -18,27 +19,25 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { for (auto convNode : repr::nn::nodeIterator(nn->dataFlow)) { auto output = repr::nn::getOutputs(convNode).front(); auto consumers = repr::nn::getConsumers(output); - if (consumers.size() != 1) { - continue; - } + NOM_REQUIRE_OR_CONT(consumers.size() == 1); + auto consumer = consumers.front(); - if (!repr::nn::is(consumer)) { - continue; - } + NOM_REQUIRE_OR_CONT(repr::nn::is(consumer)); + auto bnNode = consumer; auto bn = repr::nn::get(bnNode); + auto bnOutputs = nn::getOutputs(bnNode); + NOM_REQUIRE_OR_CONT(bnOutputs.size() == 1); + auto bnOutput = bnOutputs.front(); auto convInputs = repr::nn::getInputs(convNode); - if (convInputs.size() < 3) { - assert(0 && "Invalid convolution input size (TODO: optional bias)"); - continue; - } + CAFFE_ENFORCE( + convInputs.size() >= 3, + "Invalid convolution input size (TODO: optional bias)"); auto bnInputs = repr::nn::getInputs(bnNode); - if (bnInputs.size() < 5) { - assert(0 && "Invalid batch normalization input size"); - continue; - } + CAFFE_ENFORCE( + bnInputs.size() >= 5, "Invalid batch normalization input size"); #define EXPOSE_TENSOR_DATA(name, index, inputs) \ auto name = repr::nn::get(inputs[index]); \ @@ -69,6 +68,8 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { biasConvData[c] = bias; } + nn->dataFlow.deleteNode(output); + nn->dataFlow.createEdge(convNode, bnOutput); nn->dataFlow.deleteNode(bnNode); return true; } diff --git a/caffe2/opt/mobile.cc b/caffe2/opt/mobile.cc index 6d0006818789bb..adbbbd19a1e367 100644 --- a/caffe2/opt/mobile.cc +++ b/caffe2/opt/mobile.cc @@ -11,23 +11,15 @@ using namespace nom; void addNNPACK(repr::NNModule* nn, bool low_memory) { for (auto node : nn->dataFlow.getMutableNodes()) { - auto* nodeData = node->data().get(); // Let graph retain ownership. - // Skip blobs. - if (!isa(nodeData)) { - continue; - } + NOM_REQUIRE_OR_CONT(repr::nn::is(node)); // Check if it is a convolution. - auto nnOp = dyn_cast(nodeData); - if (!isa(nnOp)) { - continue; - } + auto nnOp = repr::nn::get(node); + NOM_REQUIRE_OR_CONT(isa(nnOp)); // Requires X, W, b for NNPACK - if (node->getInEdges().size() < 3) { - continue; - } + NOM_REQUIRE_OR_CONT(node->getInEdges().size() >= 3); std::string engine = "NNPACK"; @@ -35,9 +27,7 @@ void addNNPACK(repr::NNModule* nn, bool low_memory) { bool validTransformCandidate = true; auto conv = dyn_cast(nnOp); - if (conv->getLayout() != nom::repr::Conv::NNLayout::NCHW) { - continue; - } + NOM_REQUIRE_OR_CONT(conv->getLayout() == nom::repr::Conv::NNLayout::NCHW); // NNPACK only supports stride == 1 for (auto stride : conv->getStrides()) { @@ -46,28 +36,21 @@ void addNNPACK(repr::NNModule* nn, bool low_memory) { break; } } - if (!validTransformCandidate) { - continue; - } + NOM_REQUIRE_OR_CONT(validTransformCandidate); // NNPACK only supports 2DConv. const auto& kernelShape = conv->getKernelShape(); - if (kernelShape.size() != 2) { - continue; - } + NOM_REQUIRE_OR_CONT(kernelShape.size() == 2); // Kx1 and 1xK convs are inefficient in NNPACK. 
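// (NOM_REQUIRE_OR_CONT(cond) skips to the next loop iteration when `cond` does
// not hold; it is what replaces the explicit if (...) { continue; } blocks
// deleted in this function and in fusion.cc above.)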
if (kernelShape[0] != kernelShape[1]) { - if (kernelShape[0] == 1 || kernelShape[1] == 1) { - continue; - } + NOM_REQUIRE_OR_CONT(kernelShape[0] != 1 && kernelShape[1] != 1); } // We're good to use our engine. auto annotation = conv->getMutableAnnotation(); - if (!annotation || !isa(annotation)) { - continue; - } + NOM_REQUIRE_OR_CONT(annotation && isa(annotation)); + auto* op = dyn_cast(annotation)->getMutableOperatorDef(); op->set_engine(engine); if (!low_memory) { diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc index 75baec0e9be66b..09528b99b5da51 100644 --- a/caffe2/opt/onnxifi_transformer.cc +++ b/caffe2/opt/onnxifi_transformer.cc @@ -323,8 +323,10 @@ void OnnxifiTransformer::Transform( // function to tell whether the ONNXIFI backend supports a given C2 op or not // TODO: choose backend id + onnxifi_library* backend = lib_; + onnxBackendID backend_id = backend_ids_[0]; auto supports = - [&exporter, &shape_hints, backend = lib_, backend_id = backend_ids_[0]]( + [&exporter, &shape_hints, backend, backend_id]( const caffe2::OperatorDef& op) { const OpSchema* schema = OpSchemaRegistry::Schema(op.type()); // NB: this might not be a hard constraint as we can just export C2 diff --git a/caffe2/predictor/predictor.cc b/caffe2/predictor/predictor.cc index 4c1e13d1008ac8..03264daf50f6a7 100644 --- a/caffe2/predictor/predictor.cc +++ b/caffe2/predictor/predictor.cc @@ -2,6 +2,7 @@ #ifdef CAFFE2_OPTIMIZER #include "caffe2/opt/optimizer.h" #endif +#include "caffe2/utils/proto_utils.h" #include #include "caffe2/core/init.h" @@ -96,7 +97,9 @@ Predictor::Predictor( GlobalInit(); #endif auto predict_net = config_.predict_net; - if (optimization) { + + if (optimization && + !ArgumentHelper::HasArgument(*predict_net, "disable_nomnigraph")) { #ifdef CAFFE2_OPTIMIZER try { *predict_net = opt::optimize(*predict_net, &ws_, optimization); diff --git a/caffe2/predictor/predictor.h b/caffe2/predictor/predictor.h index a3f05d7aacac89..458bf4401476c4 100644 --- a/caffe2/predictor/predictor.h +++ b/caffe2/predictor/predictor.h @@ -28,7 +28,7 @@ class Predictor { const NetDef& run_net, Workspace* parent = nullptr, bool run_init = true, - int optimization = 0); + int optimization = 1); ~Predictor() {} diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index d10bfe209f7b39..dd1734a587c1fc 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -630,7 +630,7 @@ def _dense_gftrl(alpha, beta, lambda1, lambda2, w, nz, g): beta=st.floats(min_value=0.1, max_value=0.9), lambda1=st.floats(min_value=0.001, max_value=0.1), lambda2=st.floats(min_value=0.001, max_value=0.1), - engine=st.sampled_from([None]), + engine=st.sampled_from([None, "SIMD"]), **hu.gcs_cpu_only) def test_gftrl_sgd(self, inputs, in_place, alpha, beta, lambda1, lambda2, engine, gc, dc): diff --git a/caffe2/python/models/seq2seq/translate.py b/caffe2/python/models/seq2seq/translate.py index b1c0e1cd885ea4..d2b6a4f6399fff 100644 --- a/caffe2/python/models/seq2seq/translate.py +++ b/caffe2/python/models/seq2seq/translate.py @@ -5,10 +5,12 @@ from __future__ import print_function from __future__ import unicode_literals +from abc import ABCMeta, abstractmethod import argparse from future.utils import viewitems import logging import numpy as np +from six import with_metaclass import sys from caffe2.python import core, rnn_cell, workspace @@ -31,7 +33,60 @@ def _weighted_sum(model, values, weight, output_name): ) -class Seq2SeqModelCaffe2EnsembleDecoder(object): +class 
Seq2SeqModelCaffe2EnsembleDecoderBase(with_metaclass(ABCMeta, object)): + + @abstractmethod + def get_model_file(self, model): + pass + + @abstractmethod + def get_db_type(self): + pass + + def build_word_rewards(self, vocab_size, word_reward, unk_reward): + word_rewards = np.full([vocab_size], word_reward, dtype=np.float32) + word_rewards[seq2seq_util.PAD_ID] = 0 + word_rewards[seq2seq_util.GO_ID] = 0 + word_rewards[seq2seq_util.EOS_ID] = 0 + word_rewards[seq2seq_util.UNK_ID] = word_reward + unk_reward + return word_rewards + + def load_models(self): + db_reader = 'reader' + for model, scope_name in zip( + self.models, + self.decoder_scope_names, + ): + params_for_current_model = [ + param + for param in self.model.GetAllParams() + if str(param).startswith(scope_name) + ] + assert workspace.RunOperatorOnce(core.CreateOperator( + 'CreateDB', + [], [db_reader], + db=self.get_model_file(model), + db_type=self.get_db_type()) + ), 'Failed to create db {}'.format(self.get_model_file(model)) + assert workspace.RunOperatorOnce(core.CreateOperator( + 'Load', + [db_reader], + params_for_current_model, + load_all=1, + add_prefix=scope_name + '/', + strip_prefix='gpu_0/', + )) + logger.info('Model {} is loaded from a checkpoint {}'.format( + scope_name, self.get_model_file(model))) + + +class Seq2SeqModelCaffe2EnsembleDecoder(Seq2SeqModelCaffe2EnsembleDecoderBase): + + def get_model_file(self, model): + return model['model_file'] + + def get_db_type(self): + return 'minidb' def scope(self, scope_name, blob_name): return ( @@ -258,14 +313,6 @@ def _build_decoder( attention_weights, ) - def build_word_rewards(self, vocab_size, word_reward, unk_reward): - word_rewards = np.full([vocab_size], word_reward, dtype=np.float32) - word_rewards[seq2seq_util.PAD_ID] = 0 - word_rewards[seq2seq_util.GO_ID] = 0 - word_rewards[seq2seq_util.EOS_ID] = 0 - word_rewards[seq2seq_util.UNK_ID] = word_reward + unk_reward - return word_rewards - def __init__( self, translate_params, @@ -414,36 +461,6 @@ def __init__( for param in self.model.params: logger.info(param) - def load_models(self): - db_reader = 'reader' - for model, scope_name in zip( - self.models, - self.decoder_scope_names, - ): - params_for_current_model = [ - param - for param in self.model.GetAllParams() - if str(param).startswith(scope_name) - ] - assert workspace.RunOperatorOnce(core.CreateOperator( - 'CreateDB', - [], [db_reader], - db=model['model_file'], - db_type='minidb') - ), 'Failed to create db {}'.format(model['model_file']) - assert workspace.RunOperatorOnce(core.CreateOperator( - 'Load', - [db_reader], - params_for_current_model, - load_all=1, - add_prefix=scope_name + '/', - strip_prefix='gpu_0/', - )) - logger.info('Model {} is loaded from a checkpoint {}'.format( - scope_name, - model['model_file'], - )) - def decode(self, numberized_input, max_output_seq_len): workspace.FeedBlob( self.encoder_inputs, diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index dab79b8b1fb0b4..93e45704fcfea6 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -35,6 +35,7 @@ import onnx.defs import onnx.optimizer import onnx.shape_inference +import onnx.utils from onnx.backend.base import Backend, Device, DeviceType, namedtupledict from caffe2.python.onnx.workspace import Workspace @@ -876,6 +877,7 @@ def _graph_to_net(cls, onnx_graph, opset_version): def _onnx_model_to_caffe2_net(cls, onnx_model, device, opset_version, include_initializers): device_option = get_device_option(Device(device)) + onnx_model = 
onnx.utils.polish_model(onnx_model) init_model = cls.optimize_onnx(onnx_model, init=True) pred_model = cls.optimize_onnx(onnx_model, predict=True) diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index db870972f83946..ee60d776d55a82 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -1421,7 +1421,8 @@ def build_ftrl(model, engine="SIMD", **kwargs): def build_gftrl(model, engine="", **kwargs): - # SIMD version of GFTRL is not supported + if engine == "SIMD": + assert core.IsOperator('GFtrl_ENGINE_SIMD') gftrl_optimizer = GFtrlOptimizer(engine=engine, **kwargs) return _build(model, gftrl_optimizer) diff --git a/caffe2/python/predictor/mobile_exporter.py b/caffe2/python/predictor/mobile_exporter.py index 07f88def015544..3c42c2073163cd 100644 --- a/caffe2/python/predictor/mobile_exporter.py +++ b/caffe2/python/predictor/mobile_exporter.py @@ -20,6 +20,7 @@ def add_tensor(net, name, blob): np.dtype('int32'): "GivenTensorIntFill", np.dtype('int64'): "GivenTensorInt64Fill", np.dtype('uint8'): "GivenTensorStringFill", + np.dtype('O'): "GivenTensorStringFill" } shape = blob.shape @@ -29,6 +30,12 @@ def add_tensor(net, name, blob): if blob.dtype == np.dtype('uint8'): shape = [1] values = [str(blob.data)] + # Only allow string arrays as objects. + # The only intended use case for this is to store arrays of strings in the + # model which can be used for post processing results in subsequent ops. + if blob.dtype == np.dtype('O'): + for blob_val in blob: + assert(isinstance(blob_val, bytes)) op = core.CreateOperator( kTypeNameMapper[blob.dtype], diff --git a/caffe2/python/predictor/mobile_exporter_test.py b/caffe2/python/predictor/mobile_exporter_test.py index e7bbe2c90351c4..1c4cf77ea0512f 100644 --- a/caffe2/python/predictor/mobile_exporter_test.py +++ b/caffe2/python/predictor/mobile_exporter_test.py @@ -73,11 +73,15 @@ def test_mobile_exporter_datatypes(self): model = ModelHelper(name="mobile_exporter_test_model") model.Copy("data_int", "out") model.params.append("data_int") + model.Copy("data_obj", "out_obj") + model.params.append("data_obj") # Create our mobile exportable networks workspace.RunNetOnce(model.param_init_net) np_data_int = np.random.randint(100, size=(1, 1, 28, 28), dtype=np.int32) workspace.FeedBlob("data_int", np_data_int) + np_data_obj = np.array(['aa', 'bb']).astype(np.dtype('O')) + workspace.FeedBlob("data_obj", np_data_obj) init_net, predict_net = mobile_exporter.Export( workspace, model.net, model.params @@ -86,6 +90,7 @@ def test_mobile_exporter_datatypes(self): workspace.CreateNet(model.net) workspace.RunNet(model.net) ref_out = workspace.FetchBlob("out") + ref_out_obj = workspace.FetchBlob("out_obj") # Clear the workspace workspace.ResetWorkspace() @@ -97,9 +102,11 @@ def test_mobile_exporter_datatypes(self): workspace.CreateNet(predict_net, True) workspace.RunNet(predict_net.name) manual_run_out = workspace.FetchBlob("out") + manual_run_out_obj = workspace.FetchBlob("out_obj") np.testing.assert_allclose( ref_out, manual_run_out, atol=1e-10, rtol=1e-10 ) + np.testing.assert_equal(ref_out_obj, manual_run_out_obj) # Clear the workspace workspace.ResetWorkspace() @@ -109,11 +116,17 @@ def test_mobile_exporter_datatypes(self): init_net.SerializeToString(), predict_net.SerializeToString() ) - # Output is a vector of outputs but we only care about the first and only result + # Output is a vector of outputs. 
predictor_out = predictor.run([]) - assert len(predictor_out) == 1 - predictor_out = predictor_out[0] - + assert len(predictor_out) == 2 + predictor_out_int = predictor_out[1] + predictor_out_obj = predictor_out[0] + # The order in predictor_out is non-deterministic. Use type of the entry + # to figure out what to compare it to. + if isinstance(predictor_out[1][0], bytes): + predictor_out_int = predictor_out[0] + predictor_out_obj = predictor_out[1] np.testing.assert_allclose( - ref_out, predictor_out, atol=1e-10, rtol=1e-10 + ref_out, predictor_out_int, atol=1e-10, rtol=1e-10 ) + np.testing.assert_equal(ref_out_obj, predictor_out_obj) diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 2edc88ce0458d4..6e66cd75315716 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -179,6 +179,7 @@ def test_transformer_SinkMaxPool(self): epsilon=st.floats(min_value=1e-5, max_value=1e-2), ) def test_transformer_FuseConvBN(self, size, input_channels, seed, order, epsilon): + workspace.ResetWorkspace() net = core.Net("net") c = input_channels h = size @@ -204,16 +205,22 @@ def test_transformer_FuseConvBN(self, size, input_channels, seed, order, epsilon workspace.FeedBlob("scale", np.random.rand(c).astype(np.float32)) workspace.FeedBlob("bias", np.random.rand(c).astype(np.float32)) workspace.FeedBlob("mean", np.random.rand(c).astype(np.float32)) - workspace.FeedBlob("var", np.random.rand(c).astype(np.float32)) + # This is necessary because 1/sqrt(var) is used and if var is too small + # we get floating point artifacts that cause test failures + workspace.FeedBlob("var", np.random.rand(c).astype(np.float32) + 0.5) workspace.RunNetOnce(net) - preTransformOutput = workspace.FetchBlob("Y2") + preTransformOutput = workspace.FetchBlob("Y2").flatten() + workspace.FeedBlob("Y2", np.zeros((1, 1))) transformer.FuseConvBN(net) # Ensure fusion assert len(net.Proto().op) == 1 workspace.RunNetOnce(net) - postTransformOutput = workspace.FetchBlob("Y2") + postTransformOutput = workspace.FetchBlob("Y2").flatten() # Check that there is no numerical difference assert np.allclose( - preTransformOutput, postTransformOutput, rtol=1e-05, atol=1e-08 + preTransformOutput, + postTransformOutput, + rtol=1e-02, + atol=1e-04 ) diff --git a/caffe2/requirements.txt b/caffe2/requirements.txt index 9a1d67efc7c2f3..07fd95b72582a2 100644 --- a/caffe2/requirements.txt +++ b/caffe2/requirements.txt @@ -1,2 +1,4 @@ numpy enum34 +pyyaml +typing diff --git a/caffe2/utils/Array.h b/caffe2/utils/Array.h index 921deb9b0b41aa..ad9a80ed9203b5 100644 --- a/caffe2/utils/Array.h +++ b/caffe2/utils/Array.h @@ -38,10 +38,10 @@ #pragma once -#include +#include #include #include -#include "caffe2/utils/C++17.h" +#include namespace c10 { namespace guts { @@ -101,32 +101,32 @@ class array final { // No explicit construct/copy/destroy for aggregate type. // DR 776. - C10_CPP14_CONSTEXPR void fill(const value_type& __u) + AT_CPP14_CONSTEXPR void fill(const value_type& __u) { std::fill_n(begin(), size(), __u); } - C10_CPP14_CONSTEXPR void swap(array& __other) + AT_CPP14_CONSTEXPR void swap(array& __other) { std::swap_ranges(begin(), end(), __other.begin()); } // Iterators. 
- C10_CPP14_CONSTEXPR iterator begin() noexcept + AT_CPP14_CONSTEXPR iterator begin() noexcept { return iterator(data()); } constexpr const_iterator begin() const noexcept { return const_iterator(data()); } - C10_CPP14_CONSTEXPR iterator end() noexcept + AT_CPP14_CONSTEXPR iterator end() noexcept { return iterator(data() + _Nm); } constexpr const_iterator end() const noexcept { return const_iterator(data() + _Nm); } - C10_CPP14_CONSTEXPR reverse_iterator rbegin() noexcept + AT_CPP14_CONSTEXPR reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } constexpr const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); } - C10_CPP14_CONSTEXPR reverse_iterator rend() noexcept + AT_CPP14_CONSTEXPR reverse_iterator rend() noexcept { return reverse_iterator(begin()); } constexpr const_reverse_iterator rend() const noexcept @@ -152,13 +152,13 @@ class array final { constexpr bool empty() const noexcept { return size() == 0; } // Element access. - C10_CPP14_CONSTEXPR reference operator[](size_type __n) noexcept + AT_CPP14_CONSTEXPR reference operator[](size_type __n) noexcept { return _AT_Type::_S_ref(_M_elems, __n); } constexpr const_reference operator[](size_type __n) const noexcept { return _AT_Type::_S_ref(_M_elems, __n); } - C10_CPP14_CONSTEXPR reference at(size_type __n) { + AT_CPP14_CONSTEXPR reference at(size_type __n) { if (__n >= _Nm) { detail::__throw_out_of_range(std::string() + "array::at: __n (which is " + to_string(__n) + ") " + @@ -177,13 +177,13 @@ class array final { _AT_Type::_S_ref(_M_elems, 0)); } - C10_CPP14_CONSTEXPR reference front() noexcept + AT_CPP14_CONSTEXPR reference front() noexcept { return *begin(); } constexpr const_reference front() const noexcept { return _AT_Type::_S_ref(_M_elems, 0); } - C10_CPP14_CONSTEXPR reference back() noexcept + AT_CPP14_CONSTEXPR reference back() noexcept { return _Nm ? *(end() - 1) : *end(); } constexpr const_reference back() const noexcept @@ -192,7 +192,7 @@ class array final { : _AT_Type::_S_ref(_M_elems, 0); } - C10_CPP14_CONSTEXPR pointer data() noexcept + AT_CPP14_CONSTEXPR pointer data() noexcept { return _AT_Type::_S_ptr(_M_elems); } constexpr const_pointer data() const noexcept diff --git a/caffe2/utils/C++17.cpp b/caffe2/utils/C++17.cpp deleted file mode 100644 index d75d9fc9dff490..00000000000000 --- a/caffe2/utils/C++17.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "caffe2/utils/C++17.h" diff --git a/caffe2/utils/CMakeLists.txt b/caffe2/utils/CMakeLists.txt index 5db06663bf6403..67897c36fe485a 100644 --- a/caffe2/utils/CMakeLists.txt +++ b/caffe2/utils/CMakeLists.txt @@ -63,8 +63,6 @@ set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} set(LIB_SOURCES_CPU Array.cpp - C++17.cpp - IdWrapper.cpp Optional.cpp Metaprogramming.cpp TypeList.cpp diff --git a/caffe2/utils/IdWrapper.cpp b/caffe2/utils/IdWrapper.cpp deleted file mode 100644 index 7646a1392d4a6b..00000000000000 --- a/caffe2/utils/IdWrapper.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "caffe2/utils/IdWrapper.h" diff --git a/caffe2/utils/IdWrapper.h b/caffe2/utils/IdWrapper.h deleted file mode 100644 index 0c8e548ca017f6..00000000000000 --- a/caffe2/utils/IdWrapper.h +++ /dev/null @@ -1,67 +0,0 @@ -#pragma once - -#include - -namespace c10 { namespace guts { - -/** - * This template simplifies generation of simple classes that wrap an id - * in a typesafe way. Namely, you can use it to create a very lightweight - * type that only offers equality comparators and hashing. 
Example: - * - * struct MyIdType final : IdWrapper { - * constexpr explicit MyIdType(uint32_t id): IdWrapper(id) {} - * }; - * - * Then in the global top level namespace: - * - * C10_DEFINE_IDWRAPPER(MyIdType); - * - * That's it - equality operators and hash functions are automatically defined - * for you, given the underlying type supports it. - */ -template -class IdWrapper { -public: - using underlying_type = UnderlyingType; - using concrete_type = ConcreteType; - -protected: - constexpr explicit IdWrapper(underlying_type id) noexcept(noexcept(underlying_type(std::declval()))) - : id_(id) {} - - constexpr underlying_type underlyingId() const noexcept(noexcept(underlying_type(std::declval()))) { - return id_; - } - -private: - friend size_t hash_value(const concrete_type& v) { - return std::hash()(v.id_); - } - - // TODO Making operator== noexcept if underlying type is noexcept equality comparable doesn't work with GCC 4.8. - // Fix this once we don't need GCC 4.8 anymore. - friend constexpr bool operator==(const concrete_type& lhs, const concrete_type& rhs) { - return lhs.id_ == rhs.id_; - } - - // TODO Making operator!= noexcept if operator== is noexcept doesn't work with GCC 4.8. - // Fix this once we don't need GCC 4.8 anymore. - friend constexpr bool operator!=(const concrete_type& lhs, const concrete_type& rhs) { - return !(lhs == rhs); - } - - underlying_type id_; -}; - -}} - -#define C10_DEFINE_HASH_FOR_IDWRAPPER(ClassName) \ - namespace std { \ - template <> \ - struct hash { \ - size_t operator()(ClassName x) const { \ - return hash_value(x); \ - } \ - }; \ - } diff --git a/caffe2/utils/TypeList.h b/caffe2/utils/TypeList.h index 3494843feae121..7c20fa6613b966 100644 --- a/caffe2/utils/TypeList.h +++ b/caffe2/utils/TypeList.h @@ -1,6 +1,6 @@ #pragma once -#include "caffe2/utils/C++17.h" +#include #include "caffe2/utils/TypeTraits.h" namespace c10 { namespace guts { namespace typelist { diff --git a/caffe2/utils/TypeTraits.h b/caffe2/utils/TypeTraits.h index 004586987a81f7..c60f8a00b1ebdd 100644 --- a/caffe2/utils/TypeTraits.h +++ b/caffe2/utils/TypeTraits.h @@ -1,6 +1,6 @@ #pragma once -#include "caffe2/utils/C++17.h" +#include #include namespace c10 { diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index e0ae5cc0336e2a..c573542af5763c 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -2605,6 +2605,13 @@ bool TransposeWithHPTT( axes_cm[i] = cm_fn(axes[cm_fn(i)]); dims_cm[i] = dims[cm_fn(i)]; } + + // HPTT doesn't handle 0 sized inputs. 
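+  // Returning false here makes the caller fall back to the generic
+  // (non-HPTT) transpose path, so zero-sized tensors are still handled
+  // instead of being passed to hptt::create_plan.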
+ for (auto dim : dims_cm) { + if (dim <= 0) { + return false; + } + } auto plan = hptt::create_plan( axes_cm.data(), ndim, diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index bc30f35f2a2eee..3829219a933b5d 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -1,3 +1,9 @@ +# This ill-named file does a number of things: +# - Installs Caffe2 header files (this has nothing to do with code generation) +# - Configures caffe2/core/macros.h +# - Creates an ATen target for its generated C++ files and adds it +# as a dependency + if (DEFINED ENV{PYTORCH_PYTHON}) message(STATUS "Using python found in $ENV{PYTORCH_PYTHON}") set(PYCMD "$ENV{PYTORCH_PYTHON}") @@ -14,6 +20,11 @@ configure_file( install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../caffe2 DESTINATION include FILES_MATCHING PATTERN "*.h") +if (NOT BUILD_ATEN) + install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core + DESTINATION include/ATen/core + FILES_MATCHING PATTERN "*.h") +endif() install(FILES ${CMAKE_BINARY_DIR}/caffe2/core/macros.h DESTINATION include/caffe2/core) diff --git a/cmake/MiscCheck.cmake b/cmake/MiscCheck.cmake index 2a4e61f97b0b18..2f2628bb149866 100644 --- a/cmake/MiscCheck.cmake +++ b/cmake/MiscCheck.cmake @@ -83,22 +83,26 @@ endif() cmake_pop_check_state() # ---[ Check for NUMA support -cmake_push_check_state(RESET) -set(CMAKE_REQUIRED_FLAGS "-std=c++11") -CHECK_CXX_SOURCE_COMPILES( +if (USE_NUMA) + cmake_push_check_state(RESET) + set(CMAKE_REQUIRED_FLAGS "-std=c++11") + CHECK_CXX_SOURCE_COMPILES( "#include #include int main(int argc, char** argv) { }" CAFFE2_IS_NUMA_AVAILABLE) - -if (CAFFE2_IS_NUMA_AVAILABLE) - message(STATUS "NUMA is available") + if (CAFFE2_IS_NUMA_AVAILABLE) + message(STATUS "NUMA is available") + else() + message(STATUS "NUMA is not available") + set(CAFFE2_DISABLE_NUMA 1) + endif() + cmake_pop_check_state() else() - message(STATUS "NUMA is not available") + message(STATUS "NUMA is disabled") set(CAFFE2_DISABLE_NUMA 1) endif() -cmake_pop_check_state() # ---[ Check if we want to turn off deprecated warning due to glog. # Note(jiayq): on ubuntu 14.04, the default glog install uses ext/hash_set that @@ -157,6 +161,15 @@ if (${COMPILER_SUPPORTS_HIDDEN_INLINE_VISIBILITY}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CAFFE2_VISIBILITY_FLAG}") endif() +# ---[ Checks if linker supports -rdynamic. `-rdynamic` tells linker +# -to add all (including unused) symbols into the dynamic symbol +# -table. We need this to get symbols when generating backtrace at +# -runtime. +check_cxx_compiler_flag("-rdynamic" COMPILER_SUPPORTS_RDYNAMIC) +if (${COMPILER_SUPPORTS_RDYNAMIC}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic") +endif() + # ---[ If we are using msvc, set no warning flags # Note(jiayq): if you are going to add a warning flag, check if this is # totally necessary, and only add when you see fit. If it is needed due to diff --git a/docs/libtorch.rst b/docs/libtorch.rst new file mode 100644 index 00000000000000..9ab59a4d749d66 --- /dev/null +++ b/docs/libtorch.rst @@ -0,0 +1,19 @@ +libtorch (C++-only) +=================== + +The core of pytorch can be built and used without Python. A +CMake-based build system compiles the C++ source code into a shared +object, libtorch.so. + +Building libtorch +----------------- + +There is a script which wraps the CMake build. 
Invoke it with + +:: + cd pytorch + BUILD_TORCH=ON ONNX_NAMESPACE=onnx_torch bash tools/build_pytorch_libs.sh --use-nnpack caffe2 + ls torch/lib/tmp_install # output is produced here + ls torch/lib/tmp_install/lib/libtorch.so # of particular interest + +Future work will simplify this further. diff --git a/docs/source/distributions.rst b/docs/source/distributions.rst index 93224462e3177e..de541b467e819e 100644 --- a/docs/source/distributions.rst +++ b/docs/source/distributions.rst @@ -203,6 +203,15 @@ Probability distributions - torch.distributions :undoc-members: :show-inheritance: +:hidden:`NegativeBinomial` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. currentmodule:: torch.distributions.negative_binomial +.. autoclass:: NegativeBinomial + :members: + :undoc-members: + :show-inheritance: + :hidden:`Normal` ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 987044bbd212f4..283409ea3676b8 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -338,6 +338,12 @@ Non-linear activations (weighted sum, nonlinearity) .. autoclass:: SELU :members: +:hidden:`CELU` +~~~~~~~~~~~~~~ + +.. autoclass:: CELU + :members: + :hidden:`Sigmoid` ~~~~~~~~~~~~~~~~~ @@ -604,6 +610,12 @@ Loss functions .. autoclass:: CrossEntropyLoss :members: +:hidden:`CTCLoss` +~~~~~~~~~~~~~~~~~ + +.. autoclass:: CTCLoss + :members: + :hidden:`NLLLoss` ~~~~~~~~~~~~~~~~~ @@ -984,6 +996,11 @@ Non-linear activation functions .. autofunction:: selu +:hidden:`celu` +~~~~~~~~~~~~~~ + +.. autofunction:: celu + :hidden:`leaky_relu` ~~~~~~~~~~~~~~~~~~~~ @@ -1180,6 +1197,11 @@ Loss functions .. autofunction:: cross_entropy +:hidden:`ctc_loss` +~~~~~~~~~~~~~~~~~~ + +.. autofunction:: ctc_loss + :hidden:`hinge_embedding_loss` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/scripts/build_activation_images.py b/docs/source/scripts/build_activation_images.py index ce424d1ff188fa..e973933e205692 100644 --- a/docs/source/scripts/build_activation_images.py +++ b/docs/source/scripts/build_activation_images.py @@ -36,6 +36,7 @@ 'ReLU6', 'RReLU', 'SELU', + 'CELU', 'Sigmoid', 'Softplus', 'Softshrink', diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index c3c85797b4cd82..06b0305d28aae8 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -46,7 +46,7 @@ A tensor can be constructed from a Python :class:`list` or sequence using the If you have a numpy array and want to avoid a copy, use :func:`torch.as_tensor`. -An tensor of specific data type can be constructed by passing a +A tensor of specific data type can be constructed by passing a :class:`torch.dtype` and/or a :class:`torch.device` to a constructor or tensor creation op: diff --git a/docs/source/torch.rst b/docs/source/torch.rst index c1e914c03c74e7..c68ec039d74ce3 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -306,3 +306,7 @@ BLAS and LAPACK Operations .. autofunction:: svd .. autofunction:: symeig .. autofunction:: trtrs + +Utilities +---------------------------------- +.. 
autofunction:: compiled_with_cxx11_abi diff --git a/scripts/build_anaconda.sh b/scripts/build_anaconda.sh index 1db0f546724103..62185d1e9dc821 100755 --- a/scripts/build_anaconda.sh +++ b/scripts/build_anaconda.sh @@ -296,6 +296,10 @@ fi # Add packages required for all Caffe2 builds add_package 'glog' add_package 'gflags' +add_package 'mkl' '>=2018' +add_package 'mkl-include' +add_package 'typing' +append_to_section 'build' '- pyyaml' caffe2_cmake_args+=("-DUSE_LEVELDB=OFF") caffe2_cmake_args+=("-DUSE_LMDB=OFF") @@ -303,10 +307,6 @@ caffe2_cmake_args+=("-DUSE_LMDB=OFF") # Add packages required for pytorch if [[ -n $integrated ]]; then add_package 'cffi' - add_package 'mkl' '>=2018' - add_package 'mkl-include' - add_package 'typing' - append_to_section 'build' '- pyyaml' append_to_section 'build' '- setuptools' #caffe2_cmake_args+=("-DBLAS=MKL") if [[ -n $cuda_ver ]]; then diff --git a/setup.py b/setup.py index 042d8668bb7b96..2e2ef60fb41313 100644 --- a/setup.py +++ b/setup.py @@ -659,7 +659,9 @@ def run(self): # Clang has an unfixed bug leading to spurious missing # braces warnings, see # https://bugs.llvm.org/show_bug.cgi?id=21629 - '-Wno-missing-braces' + '-Wno-missing-braces', + # gcc7 seems to report spurious warnings with this enabled + "-Wno-stringop-overflow", ] if check_env_flag('WERROR'): extra_compile_args.append('-Werror') @@ -1023,6 +1025,7 @@ def make_relative_rpath(path): 'lib/torch_shm_manager', 'lib/*.h', 'lib/include/ATen/*.h', + 'lib/include/ATen/core/*.h', 'lib/include/ATen/detail/*.h', 'lib/include/ATen/cuda/*.h', 'lib/include/ATen/cuda/*.cuh', diff --git a/setup_caffe2.py b/setup_caffe2.py index 0fd620549b31d8..d8ebf4fc7ed84f 100644 --- a/setup_caffe2.py +++ b/setup_caffe2.py @@ -131,6 +131,7 @@ def run(self): # configure cmake_args = [ find_executable('cmake'), + '-DUSE_ATEN=ON', '-DBUILD_SHARED_LIBS=OFF', '-DPYTHON_EXECUTABLE:FILEPATH={}'.format(sys.executable), '-DPYTHON_INCLUDE_DIR={}'.format(sysconfig.get_python_inc()), diff --git a/test/common.py b/test/common.py index 1eb4076dbf360b..4dbe3c56c47c98 100644 --- a/test/common.py +++ b/test/common.py @@ -118,16 +118,6 @@ def dec(fn): return dec -def skipIfNoZeroSize(fn): - @wraps(fn) - def wrapper(*args, **kwargs): - if torch._C._use_zero_size_dim(): - fn(*args, **kwargs) - else: - raise unittest.SkipTest('Compiled without arbitrary zero size dimension support') - return wrapper - - def get_cuda_memory_usage(): # we don't need CUDA synchronize because the statistics are not tracked at # actual freeing, but at when marking the block as free. 
diff --git a/test/common_nn.py b/test/common_nn.py index 6172f4b15adc3f..0444ba4eb6ae46 100644 --- a/test/common_nn.py +++ b/test/common_nn.py @@ -125,6 +125,7 @@ def get_weight(m): module_name='ELU', constructor_args=(2.,), input_size=(3, 2, 5), + reference_fn=lambda x, _: torch.where(x >= 0, x, 2 * (x.exp() - 1)) ), # TODO: reference function dict( @@ -448,6 +449,43 @@ def marginrankingloss_reference(input1, input2, target, margin=0, reduction='ele return output +# this directly follows Graves et al's paper, in contrast to the production implementation, it does not use log-space +def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0, reduction='elementwise_mean'): + input_lengths = torch.tensor(input_lengths, dtype=torch.long) + target_lengths = torch.tensor(target_lengths, dtype=torch.long) + dt = log_probs.dtype + log_probs = log_probs.double() # we need the accuracy as we are not in logspace + targets = targets.long() + cum_target_lengths = target_lengths.cumsum(0) + losses = [] + for i in range(log_probs.size(1)): + input_length = input_lengths[i].item() + target_length = target_lengths[i].item() + cum_target_length = cum_target_lengths[i].item() + targets_prime = targets.new_full((2 * target_length + 1,), blank) + if targets.dim() == 2: + targets_prime[1::2] = targets[i, :target_length] + else: + targets_prime[1::2] = targets[cum_target_length - target_length:cum_target_length] + probs = log_probs[:input_length, i].exp() + alpha = log_probs.new_zeros((target_length * 2 + 1,)) + alpha[0] = probs[0, blank] + alpha[1] = probs[0, targets_prime[1]] + mask_third = (targets_prime[:-2] != targets_prime[2:]) + for t in range(1, input_length): + alpha_next = alpha.clone() + alpha_next[1:] += alpha[:-1] + alpha_next[2:] += torch.where(mask_third, alpha[:-2], alpha.new_zeros(1)) + alpha = probs[t, targets_prime] * alpha_next + losses.append(-alpha[-2:].sum().log()[None]) + output = torch.cat(losses, 0) + if reduction == 'elementwise_mean': + return (output / target_lengths.to(dtype=output.dtype, device=output.device)).mean() + elif reduction == 'sum': + return output.sum() + output = output.to(dt) + return output + loss_reference_fns = { 'KLDivLoss': kldivloss_reference, 'NLLLoss': nllloss_reference, @@ -460,6 +498,7 @@ def marginrankingloss_reference(input1, input2, target, margin=0, reduction='ele 'CosineEmbeddingLoss': cosineembeddingloss_reference, 'TripletMarginLoss': tripletmarginloss_reference, 'MarginRankingLoss': marginrankingloss_reference, + 'CTCLoss': ctcloss_reference, } @@ -841,7 +880,7 @@ def check_criterion_jacobian(self, criterion, input, target): class TestBase(object): - _required_arg_names = {'constructor_args', 'input'} + _required_arg_names = {'constructor_args', 'input', 'extra_args'} def __init__(self, constructor, desc='', reference_fn=None, fullname=None, **kwargs): self.desc = desc @@ -850,8 +889,8 @@ def __init__(self, constructor, desc='', reference_fn=None, fullname=None, **kwa self.reference_fn = reference_fn for name in self._required_arg_names: if name not in kwargs and name + '_fn' not in kwargs and name + '_size' not in kwargs: - if name == 'constructor_args': - kwargs['constructor_args'] = tuple() + if name in {'constructor_args', 'extra_args'}: + kwargs[name] = tuple() else: raise ValueError("{}: Specify {} by a value, a function to generate it, or it's size!" 
.format(self.get_name(), name)) @@ -879,6 +918,10 @@ def _unpack(self, value): def constructor_args(self): return self._get_arg('constructor_args', True) + @property + def extra_args(self): + return self._get_arg('extra_args', True) + def _get_arg(self, name, unpack): assert name in self._required_arg_names @@ -1103,9 +1146,9 @@ def __call__(self, test_case): target = self._get_target() if self.reference_fn is not None: - out = test_case._forward_criterion(module, input, target) - expected_out = self.reference_fn(deepcopy(input), - deepcopy(target), module) + out = test_case._forward_criterion(module, input, target, extra_args=self.extra_args) + ref_args = (deepcopy(input), deepcopy(target)) + self.extra_args + (module,) + expected_out = self.reference_fn(*ref_args) if isinstance(expected_out, torch.Tensor): expected_out = expected_out.item() test_case.assertEqual(out, expected_out) diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 8e66a66962d44f..be2fd6e0d969ba 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -237,7 +237,7 @@ TEST_CASE("modules") { REQUIRE(functional(torch::ones({}) * -1).toCFloat() == 0); } { - auto functional = Functional(torch::elu, /*alpha=*/1, /*scale=*/0); + auto functional = Functional(torch::elu, /*alpha=*/1, /*scale=*/0, /*input_scale=*/1); REQUIRE(functional(torch::ones({})).toCFloat() == 0); } } diff --git a/test/expect/TestBatched.test_for.expect b/test/expect/TestBatched.test_for.expect new file mode 100644 index 00000000000000..bcbcffaee486a3 --- /dev/null +++ b/test/expect/TestBatched.test_for.expect @@ -0,0 +1,22 @@ +graph(%x.1_data : Dynamic + %x.1_mask : Dynamic + %x.1_dims : Dynamic + %y_data : Dynamic + %y_mask : Dynamic + %y_dims : Dynamic) { + %6 : int = prim::Constant[value=10]() + %7 : int = prim::Constant[value=1]() + %x : Dynamic, %21 : Dynamic, %22 : Dynamic = prim::Loop(%6, %7, %x.1_data, %x.1_mask, %x.1_dims) + block0(%loop_num : int, %5_data : Dynamic, %5_mask : Dynamic, %5_dims : Dynamic) { + %13 : int = prim::Constant[value=1]() + %14 : Long() = prim::NumToTensor(%13) + %alpha : float = prim::TensorToNum(%14) + %data.1 : Dynamic = aten::add(%5_data, %y_data, %alpha) + %mask : Dynamic = aten::mul(%5_mask, %y_mask) + %dims : Dynamic = aten::__or__(%5_dims, %y_dims) + %19 : int = prim::Constant[value=1]() + %data : Dynamic = aten::where(%mask, %data.1, %5_data) + -> (%19, %data, %mask, %dims) + } + return (%x, %21, %22); +} diff --git a/test/expect/TestBatched.test_if_else.expect b/test/expect/TestBatched.test_if_else.expect new file mode 100644 index 00000000000000..0698584377a433 --- /dev/null +++ b/test/expect/TestBatched.test_if_else.expect @@ -0,0 +1,52 @@ +graph(%a.1_data : Dynamic + %a.1_mask : Dynamic + %a.1_dims : Dynamic + %b_data : Dynamic + %b_mask : Dynamic + %b_dims : Dynamic) { + %6 : Dynamic = aten::gt(%a.1_data, %b_data) + %7 : Dynamic = aten::mul(%a.1_mask, %b_mask) + %8 : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %9 : int = prim::TensorToNum(%6) + %10 : int = prim::Constant[value=1]() + %11 : Long() = prim::NumToTensor(%10) + %alpha.1 : float = prim::TensorToNum(%11) + %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha.1) + %mask.1 : Dynamic = aten::mul(%a.1_mask, %b_mask) + %dims.1 : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %16 : int = prim::Constant[value=1]() + %17 : Long() = prim::NumToTensor(%16) + %alpha : float = prim::TensorToNum(%17) + %data.4 : Dynamic = aten::sub(%a.1_data, %b_data, %alpha) + %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) + %dims : Dynamic 
= aten::__or__(%a.1_dims, %b_dims) + %22 : Dynamic = aten::type_as(%7, %6) + %cond_mask.1 : Dynamic = aten::mul(%6, %22) + %24 : int = aten::dim(%cond_mask.1) + %25 : int = prim::Constant[value=1]() + %26 : int = aten::eq(%24, %25) + %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%26) + block0() { + %30 : int = aten::dim(%data.1) + %31 : int = prim::Constant[value=1]() + %32 : int = aten::sub(%30, %31) + %33 : int = prim::Constant[value=1]() + %data.3 : Dynamic = prim::Loop(%32, %33, %cond_mask.1) + block0(%_ : int, %36 : Dynamic) { + %37 : int = aten::dim(%36) + %data.2 : Dynamic = aten::unsqueeze(%36, %37) + %39 : int = prim::Constant[value=1]() + -> (%39, %data.2) + } + %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) + %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask.1) + -> (%cond_data.1, %cond_mask.2, %data.3) + } + block1() { + -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) + } + %res_data : Dynamic = aten::where(%cond_data, %data.1, %data.4) + %res_mask : Dynamic = aten::where(%cond_mask, %mask.1, %mask) + %res_dims : Dynamic = aten::__or__(%dims.1, %dims) + return (%res_data, %res_mask, %res_dims); +} diff --git a/test/expect/TestBatched.test_if_else_with_scalar.expect b/test/expect/TestBatched.test_if_else_with_scalar.expect new file mode 100644 index 00000000000000..c7755a5b5501fc --- /dev/null +++ b/test/expect/TestBatched.test_if_else_with_scalar.expect @@ -0,0 +1,53 @@ +graph(%a.1_data : Dynamic + %a.1_mask : Dynamic + %a.1_dims : Dynamic + %b_data : Dynamic + %b_mask : Dynamic + %b_dims : Dynamic) { + %6 : float = prim::Constant[value=0.1]() + %7 : Float() = prim::NumToTensor(%6) + %other : float = prim::TensorToNum(%7) + %9 : Dynamic = aten::gt(%a.1_data, %other) + %10 : int = prim::TensorToNum(%9) + %11 : int = prim::Constant[value=1]() + %12 : Long() = prim::NumToTensor(%11) + %alpha.1 : float = prim::TensorToNum(%12) + %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha.1) + %mask.1 : Dynamic = aten::mul(%a.1_mask, %b_mask) + %dims.1 : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %17 : int = prim::Constant[value=1]() + %18 : Long() = prim::NumToTensor(%17) + %alpha : float = prim::TensorToNum(%18) + %data.4 : Dynamic = aten::sub(%a.1_data, %b_data, %alpha) + %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) + %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %23 : Dynamic = aten::type_as(%a.1_mask, %9) + %cond_mask.1 : Dynamic = aten::mul(%9, %23) + %25 : int = aten::dim(%cond_mask.1) + %26 : int = prim::Constant[value=1]() + %27 : int = aten::eq(%25, %26) + %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%27) + block0() { + %31 : int = aten::dim(%data.1) + %32 : int = prim::Constant[value=1]() + %33 : int = aten::sub(%31, %32) + %34 : int = prim::Constant[value=1]() + %data.3 : Dynamic = prim::Loop(%33, %34, %cond_mask.1) + block0(%_ : int, %37 : Dynamic) { + %38 : int = aten::dim(%37) + %data.2 : Dynamic = aten::unsqueeze(%37, %38) + %40 : int = prim::Constant[value=1]() + -> (%40, %data.2) + } + %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) + %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask.1) + -> (%cond_data.1, %cond_mask.2, %data.3) + } + block1() { + -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) + } + %res_data : Dynamic = aten::where(%cond_data, %data.1, %data.4) + %res_mask : Dynamic = aten::where(%cond_mask, %mask.1, %mask) + %res_dims : Dynamic = aten::__or__(%dims.1, %dims) + return (%res_data, %res_mask, %res_dims); +} diff --git 
a/test/expect/TestBatched.test_if_noelse.expect b/test/expect/TestBatched.test_if_noelse.expect new file mode 100644 index 00000000000000..1d98fe9d02f29c --- /dev/null +++ b/test/expect/TestBatched.test_if_noelse.expect @@ -0,0 +1,46 @@ +graph(%a.1_data : Dynamic + %a.1_mask : Dynamic + %a.1_dims : Dynamic + %b_data : Dynamic + %b_mask : Dynamic + %b_dims : Dynamic) { + %6 : Dynamic = aten::gt(%a.1_data, %b_data) + %7 : Dynamic = aten::mul(%a.1_mask, %b_mask) + %8 : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %9 : int = prim::TensorToNum(%6) + %10 : int = prim::Constant[value=1]() + %11 : Long() = prim::NumToTensor(%10) + %alpha : float = prim::TensorToNum(%11) + %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha) + %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) + %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %16 : Dynamic = aten::type_as(%7, %6) + %cond_mask.1 : Dynamic = aten::mul(%6, %16) + %18 : int = aten::dim(%cond_mask.1) + %19 : int = prim::Constant[value=1]() + %20 : int = aten::eq(%18, %19) + %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%20) + block0() { + %24 : int = aten::dim(%data.1) + %25 : int = prim::Constant[value=1]() + %26 : int = aten::sub(%24, %25) + %27 : int = prim::Constant[value=1]() + %data.3 : Dynamic = prim::Loop(%26, %27, %cond_mask.1) + block0(%_ : int, %30 : Dynamic) { + %31 : int = aten::dim(%30) + %data.2 : Dynamic = aten::unsqueeze(%30, %31) + %33 : int = prim::Constant[value=1]() + -> (%33, %data.2) + } + %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) + %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) + -> (%cond_data.1, %cond_mask.2, %data.3) + } + block1() { + -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) + } + %res_data : Dynamic = aten::where(%cond_data, %data.1, %a.1_data) + %res_mask : Dynamic = aten::where(%cond_mask, %mask, %a.1_mask) + %res_dims : Dynamic = aten::__or__(%dims, %a.1_dims) + return (%res_data, %res_mask, %res_dims); +} diff --git a/test/expect/TestBatched.test_if_noelse_with_scalar.expect b/test/expect/TestBatched.test_if_noelse_with_scalar.expect new file mode 100644 index 00000000000000..935bedb22b3f80 --- /dev/null +++ b/test/expect/TestBatched.test_if_noelse_with_scalar.expect @@ -0,0 +1,47 @@ +graph(%a.1_data : Dynamic + %a.1_mask : Dynamic + %a.1_dims : Dynamic + %b_data : Dynamic + %b_mask : Dynamic + %b_dims : Dynamic) { + %6 : float = prim::Constant[value=0.1]() + %7 : Float() = prim::NumToTensor(%6) + %other : float = prim::TensorToNum(%7) + %9 : Dynamic = aten::gt(%a.1_data, %other) + %10 : int = prim::TensorToNum(%9) + %11 : int = prim::Constant[value=1]() + %12 : Long() = prim::NumToTensor(%11) + %alpha : float = prim::TensorToNum(%12) + %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha) + %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) + %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %17 : Dynamic = aten::type_as(%a.1_mask, %9) + %cond_mask.1 : Dynamic = aten::mul(%9, %17) + %19 : int = aten::dim(%cond_mask.1) + %20 : int = prim::Constant[value=1]() + %21 : int = aten::eq(%19, %20) + %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%21) + block0() { + %25 : int = aten::dim(%data.1) + %26 : int = prim::Constant[value=1]() + %27 : int = aten::sub(%25, %26) + %28 : int = prim::Constant[value=1]() + %data.3 : Dynamic = prim::Loop(%27, %28, %cond_mask.1) + block0(%_ : int, %31 : Dynamic) { + %32 : int = aten::dim(%31) + %data.2 : Dynamic = aten::unsqueeze(%31, %32) + %34 : int = prim::Constant[value=1]() + -> (%34, %data.2) + } 
+ %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) + %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) + -> (%cond_data.1, %cond_mask.2, %data.3) + } + block1() { + -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) + } + %res_data : Dynamic = aten::where(%cond_data, %data.1, %a.1_data) + %res_mask : Dynamic = aten::where(%cond_mask, %mask, %a.1_mask) + %res_dims : Dynamic = aten::__or__(%dims, %a.1_dims) + return (%res_data, %res_mask, %res_dims); +} diff --git a/test/expect/TestBatched.test_while.expect b/test/expect/TestBatched.test_while.expect new file mode 100644 index 00000000000000..a32cd392044f00 --- /dev/null +++ b/test/expect/TestBatched.test_while.expect @@ -0,0 +1,65 @@ +graph(%a.1_data : Dynamic + %a.1_mask : Dynamic + %a.1_dims : Dynamic + %b_data : Dynamic + %b_mask : Dynamic + %b_dims : Dynamic) { + %6 : int = prim::Constant[value=2147483647]() + %7 : Dynamic = aten::gt(%a.1_data, %b_data) + %8 : Dynamic = aten::mul(%a.1_mask, %b_mask) + %9 : Dynamic = aten::__or__(%a.1_dims, %b_dims) + %10 : int = prim::TensorToNum(%7) + %11 : Dynamic = aten::mul(%7, %8) + %12 : Dynamic = aten::sum(%11) + %13 : int = prim::Constant[value=0]() + %14 : Dynamic = aten::gt(%12, %13) + %15 : int = prim::TensorToNum(%14) + %64 : Dynamic, %65 : Dynamic, %66 : Dynamic, %a : Dynamic, %62 : Dynamic, %63 : Dynamic = prim::Loop(%6, %15, %7, %8, %9, %a.1_data, %a.1_mask, %a.1_dims) + block0(%loop_num : int, %cond_data.2 : Dynamic, %cond_mask.3 : Dynamic, %cond_dims : Dynamic, %6_data : Dynamic, %6_mask : Dynamic, %6_dims : Dynamic) { + %24 : int = prim::Constant[value=1]() + %25 : Long() = prim::NumToTensor(%24) + %alpha : float = prim::TensorToNum(%25) + %data.1 : Dynamic = aten::sub(%6_data, %b_data, %alpha) + %mask : Dynamic = aten::mul(%6_mask, %b_mask) + %dims : Dynamic = aten::__or__(%6_dims, %b_dims) + %30 : Dynamic = aten::gt(%data.1, %b_data) + %31 : Dynamic = aten::mul(%mask, %b_mask) + %32 : Dynamic = aten::__or__(%dims, %b_dims) + %33 : int = prim::TensorToNum(%30) + %34 : Dynamic = aten::type_as(%cond_mask.3, %cond_data.2) + %cond_mask.1 : Dynamic = aten::mul(%cond_data.2, %34) + %36 : int = aten::dim(%cond_mask.1) + %37 : int = prim::Constant[value=1]() + %38 : int = aten::eq(%36, %37) + %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%38) + block0() { + %42 : int = aten::dim(%data.1) + %43 : int = prim::Constant[value=1]() + %44 : int = aten::sub(%42, %43) + %45 : int = prim::Constant[value=1]() + %data.3 : Dynamic = prim::Loop(%44, %45, %cond_mask.1) + block0(%_ : int, %48 : Dynamic) { + %49 : int = aten::dim(%48) + %data.2 : Dynamic = aten::unsqueeze(%48, %49) + %51 : int = prim::Constant[value=1]() + -> (%51, %data.2) + } + %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) + %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) + -> (%cond_data.1, %cond_mask.2, %data.3) + } + block1() { + -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) + } + %res_data : Dynamic = aten::where(%cond_data, %data.1, %6_data) + %res_mask : Dynamic = aten::where(%cond_mask, %mask, %6_mask) + %res_dims : Dynamic = aten::__or__(%dims, %6_dims) + %57 : Dynamic = aten::mul(%30, %31) + %58 : Dynamic = aten::sum(%57) + %59 : int = prim::Constant[value=0]() + %60 : Dynamic = aten::gt(%58, %59) + %61 : int = prim::TensorToNum(%60) + -> (%61, %30, %31, %32, %res_data, %res_mask, %res_dims) + } + return (%a, %62, %63); +} diff --git a/test/expect/TestJit.test_concat_fusion.expect b/test/expect/TestJit.test_concat_fusion.expect index 027c2de33e5926..454a84cba1db76 100644 
--- a/test/expect/TestJit.test_concat_fusion.expect +++ b/test/expect/TestJit.test_concat_fusion.expect @@ -3,12 +3,11 @@ graph(%0 : Float(3, 20) %2 : Float(6, 20) = prim::FusionGroup_0[device=0](%0, %1) return (%2); } -with prim::FusionGroup_0 = graph(%4 : Float(3, 20) - %5 : Float(3, 20)) { - %7 : int = prim::Constant[value=1]() - %8 : Float(3, 20) = aten::add(%4, %5, %7) - %6 : Float(3, 20) = aten::mul(%4, %5) - %2 : int = prim::Constant[value=0]() - %3 : Float(6, 20) = aten::cat(%8, %6, %2) - return (%3); +with prim::FusionGroup_0 = graph(%3 : Float(3, 20) + %4 : Float(3, 20)) { + %6 : int = prim::Constant[value=1]() + %7 : Float(3, 20) = aten::add(%3, %4, %6) + %5 : Float(3, 20) = aten::mul(%3, %4) + %2 : Float(6, 20) = prim::FusedConcat[dim=0](%7, %5) + return (%2); } diff --git a/test/expect/TestJit.test_constant_prop_nested.expect b/test/expect/TestJit.test_constant_prop_nested.expect new file mode 100644 index 00000000000000..09ef82076edc4a --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_nested.expect @@ -0,0 +1,15 @@ +graph(%a : Dynamic) { + %1 : int = prim::Constant[value=2]() + %2 : Dynamic = aten::lt(%a, %1) + %3 : int = prim::TensorToNum(%2) + %c : int = prim::If(%3) + block0() { + %5 : int = prim::Constant[value=5]() + -> (%5) + } + block1() { + %6 : int = prim::Constant[value=1]() + -> (%6) + } + return (%c); +} diff --git a/test/expect/TestJit.test_constant_prop_print.expect b/test/expect/TestJit.test_constant_prop_print.expect new file mode 100644 index 00000000000000..7cadfdbbc6b3ea --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_print.expect @@ -0,0 +1,12 @@ +graph(%input_tensor : Dynamic) { + %1 : int = prim::Constant[value=6]() + %2 : Dynamic = ^FIXME_zerol()() + %a : Dynamic = aten::add(%1, %2) + = prim::Print(%a) + %4 : int = prim::Constant[value=2]() + %5 : int = prim::Constant[value=1]() + %b : Dynamic = aten::add(%a, %4, %5) + %7 : int = prim::Constant[value=1]() + %8 : Dynamic = aten::add(%b, %input_tensor, %7) + return (%8); +} diff --git a/test/expect/TestJit.test_constant_prop_rand.expect b/test/expect/TestJit.test_constant_prop_rand.expect new file mode 100644 index 00000000000000..a6c305258bff95 --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_rand.expect @@ -0,0 +1,11 @@ +graph() { + %0 : int = prim::Constant[value=6]() + %1 : int = prim::Constant[value=0]() + %2 : int[] = prim::Constant[value=[0, -1]]() + %3 : int[] = prim::Constant[value=[3]]() + %a : Dynamic = aten::randn(%3, %0, %1, %2) + %5 : int = prim::Constant[value=2]() + %6 : int = prim::Constant[value=1]() + %b : Dynamic = aten::add(%a, %5, %6) + return (%b); +} diff --git a/test/expect/TestJit.test_constant_prop_simple.expect b/test/expect/TestJit.test_constant_prop_simple.expect new file mode 100644 index 00000000000000..029f9ac05a0783 --- /dev/null +++ b/test/expect/TestJit.test_constant_prop_simple.expect @@ -0,0 +1,5 @@ +graph(%input_tensor : Dynamic) { + %1 : int = prim::Constant[value=8]() + %2 : Dynamic = aten::add(%1, %input_tensor) + return (%2); +} diff --git a/test/expect/TestJit.test_lstm_fusion_concat.expect b/test/expect/TestJit.test_lstm_fusion_concat.expect index 7884a95c48c9a1..f0771c133c11d9 100644 --- a/test/expect/TestJit.test_lstm_fusion_concat.expect +++ b/test/expect/TestJit.test_lstm_fusion_concat.expect @@ -16,34 +16,33 @@ graph(%0 : Float(3, 10) %21 : Float(6, 20) = prim::FusionGroup_0[device=0](%2, %16, %20, %15, %19, %14, %18, %13, %17) return (%21); } -with prim::FusionGroup_0 = graph(%16 : Float(3, 20) +with prim::FusionGroup_0 = 
graph(%15 : Float(3, 20) + %25 : Float(3!, 20) %26 : Float(3!, 20) - %27 : Float(3!, 20) + %29 : Float(3!, 20) %30 : Float(3!, 20) - %31 : Float(3!, 20) + %33 : Float(3!, 20) %34 : Float(3!, 20) - %35 : Float(3!, 20) - %38 : Float(3!, 20) - %39 : Float(3!, 20)) { - %40 : int = prim::Constant[value=1]() - %41 : Float(3, 20) = aten::add(%38, %39, %40) - %36 : int = prim::Constant[value=1]() - %37 : Float(3, 20) = aten::add(%34, %35, %36) - %32 : int = prim::Constant[value=1]() - %33 : Float(3, 20) = aten::add(%30, %31, %32) - %28 : int = prim::Constant[value=1]() - %29 : Float(3, 20) = aten::add(%26, %27, %28) - %25 : Float(3, 20) = aten::sigmoid(%41) - %23 : Float(3, 20) = aten::sigmoid(%37) - %21 : Float(3, 20) = aten::tanh(%33) - %19 : Float(3, 20) = aten::sigmoid(%29) - %17 : Float(3, 20) = aten::mul(%23, %16) - %14 : Float(3, 20) = aten::mul(%25, %21) - %10 : int = prim::Constant[value=1]() - %11 : Float(3, 20) = aten::add(%17, %14, %10) - %7 : Float(3, 20) = aten::tanh(%11) - %6 : Float(3, 20) = aten::mul(%19, %7) - %2 : int = prim::Constant[value=0]() - %3 : Float(6, 20) = aten::cat(%6, %11, %2) - return (%3); + %37 : Float(3!, 20) + %38 : Float(3!, 20)) { + %39 : int = prim::Constant[value=1]() + %40 : Float(3, 20) = aten::add(%37, %38, %39) + %35 : int = prim::Constant[value=1]() + %36 : Float(3, 20) = aten::add(%33, %34, %35) + %31 : int = prim::Constant[value=1]() + %32 : Float(3, 20) = aten::add(%29, %30, %31) + %27 : int = prim::Constant[value=1]() + %28 : Float(3, 20) = aten::add(%25, %26, %27) + %24 : Float(3, 20) = aten::sigmoid(%40) + %22 : Float(3, 20) = aten::sigmoid(%36) + %20 : Float(3, 20) = aten::tanh(%32) + %18 : Float(3, 20) = aten::sigmoid(%28) + %16 : Float(3, 20) = aten::mul(%22, %15) + %13 : Float(3, 20) = aten::mul(%24, %20) + %9 : int = prim::Constant[value=1]() + %10 : Float(3, 20) = aten::add(%16, %13, %9) + %6 : Float(3, 20) = aten::tanh(%10) + %5 : Float(3, 20) = aten::mul(%18, %6) + %2 : Float(6, 20) = prim::FusedConcat[dim=0](%5, %10) + return (%2); } diff --git a/test/expect/TestScript.test_cat_lifts.expect b/test/expect/TestScript.test_cat_lifts.expect index ea2fa3737c0556..c8c82e5199c030 100644 --- a/test/expect/TestScript.test_cat_lifts.expect +++ b/test/expect/TestScript.test_cat_lifts.expect @@ -1,15 +1,18 @@ graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() - %2 : Dynamic = aten::cat(%x, %x, %1) - return (%2); + %2 : Dynamic[] = prim::ListConstruct(%x, %x) + %3 : Dynamic = aten::cat(%2, %1) + return (%3); } graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() - %2 : Dynamic = aten::cat(%1) - return (%2); + %2 : Dynamic[] = prim::ListConstruct() + %3 : Dynamic = aten::cat(%2, %1) + return (%3); } graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() - %2 : Dynamic = aten::cat(%x, %1) - return (%2); + %2 : Dynamic[] = prim::ListConstruct(%x) + %3 : Dynamic = aten::cat(%2, %1) + return (%3); } diff --git a/test/expect/TestScript.test_index_put_trace_with_view.expect b/test/expect/TestScript.test_index_put_trace_with_view.expect index 591e499da96671..37f08643f139a4 100644 --- a/test/expect/TestScript.test_index_put_trace_with_view.expect +++ b/test/expect/TestScript.test_index_put_trace_with_view.expect @@ -6,6 +6,7 @@ graph(%0 : Double(100) %5 : Double(4) = aten::view(%2, %4) %6 : int = prim::Constant[value=0]() %7 : Long(4) = aten::_cast_Long(%1, %6) - %19 : Double(100) = aten::index_put(%0, %7, %5) - return (%19); + %8 : Dynamic[] = prim::ListConstruct(%7) + %20 : Double(100) = aten::index_put(%0, %8, %5) + return (%20); } diff 
--git a/test/expect/TestScript.test_index_put_trace_without_view.expect b/test/expect/TestScript.test_index_put_trace_without_view.expect index 42f8e49142942e..772308223b454b 100644 --- a/test/expect/TestScript.test_index_put_trace_without_view.expect +++ b/test/expect/TestScript.test_index_put_trace_without_view.expect @@ -3,6 +3,7 @@ graph(%0 : Double(100) %2 : Double(4)) { %3 : int = prim::Constant[value=0]() %4 : Long(4) = aten::_cast_Long(%1, %3) - %16 : Double(100) = aten::index_put(%0, %4, %2) - return (%16); + %5 : Dynamic[] = prim::ListConstruct(%4) + %17 : Double(100) = aten::index_put(%0, %5, %2) + return (%17); } diff --git a/test/onnx/expect/TestOperators.test_elu.expect b/test/onnx/expect/TestOperators.test_elu.expect new file mode 100644 index 00000000000000..a8eff9ab2c1387 --- /dev/null +++ b/test/onnx/expect/TestOperators.test_elu.expect @@ -0,0 +1,63 @@ +ir_version: 3 +producer_name: "pytorch" +producer_version: "0.3" +graph { + node { + input: "0" + output: "1" + op_type: "Elu" + attribute { + name: "alpha" + f: 1 + type: FLOAT + } + } + name: "torch-jit-export" + input { + name: "0" + type { + tensor_type { + elem_type: FLOAT + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 2 + } + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } + output { + name: "1" + type { + tensor_type { + elem_type: FLOAT + shape { + dim { + dim_value: 1 + } + dim { + dim_value: 2 + } + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } +} +opset_import { + version: 7 +} diff --git a/test/onnx/expect/TestOperators.test_equal.expect b/test/onnx/expect/TestOperators.test_equal.expect index 3d8210b14bcbee..fc23156d1cbf47 100644 --- a/test/onnx/expect/TestOperators.test_equal.expect +++ b/test/onnx/expect/TestOperators.test_equal.expect @@ -45,7 +45,7 @@ graph { name: "2" type { tensor_type { - elem_type: INT8 + elem_type: UINT8 shape { dim { dim_value: 3 diff --git a/test/onnx/expect/TestOperators.test_ge.expect b/test/onnx/expect/TestOperators.test_ge.expect index e50f2e12537d56..204a59e88ef5a6 100644 --- a/test/onnx/expect/TestOperators.test_ge.expect +++ b/test/onnx/expect/TestOperators.test_ge.expect @@ -50,7 +50,7 @@ graph { name: "3" type { tensor_type { - elem_type: INT8 + elem_type: UINT8 shape { dim { dim_value: 3 diff --git a/test/onnx/expect/TestOperators.test_gt.expect b/test/onnx/expect/TestOperators.test_gt.expect index 3cda8f244819b7..d3eb9cf08c30a6 100644 --- a/test/onnx/expect/TestOperators.test_gt.expect +++ b/test/onnx/expect/TestOperators.test_gt.expect @@ -45,7 +45,7 @@ graph { name: "2" type { tensor_type { - elem_type: INT8 + elem_type: UINT8 shape { dim { dim_value: 3 diff --git a/test/onnx/expect/TestOperators.test_le.expect b/test/onnx/expect/TestOperators.test_le.expect index 2aefbc6dbc8622..39ba6940e2289c 100644 --- a/test/onnx/expect/TestOperators.test_le.expect +++ b/test/onnx/expect/TestOperators.test_le.expect @@ -50,7 +50,7 @@ graph { name: "3" type { tensor_type { - elem_type: INT8 + elem_type: UINT8 shape { dim { dim_value: 3 diff --git a/test/onnx/expect/TestOperators.test_lt.expect b/test/onnx/expect/TestOperators.test_lt.expect index 83656cb3a5ce04..cd9c4eaaaf50a7 100644 --- a/test/onnx/expect/TestOperators.test_lt.expect +++ b/test/onnx/expect/TestOperators.test_lt.expect @@ -45,7 +45,7 @@ graph { name: "2" type { tensor_type { - elem_type: INT8 + elem_type: UINT8 shape { dim { dim_value: 3 diff --git a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect 
b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect index b1ff53c2e4e7d8..3c1321664dd3fd 100644 --- a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect +++ b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect @@ -10,33 +10,33 @@ graph { t { dims: 4 data_type: INT64 - raw_data: "\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000" + raw_data: "\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000\004\000\000\000\000\000\000\000" } type: TENSOR } } node { - input: "0" - input: "1" output: "2" - op_type: "Reshape" - } - node { - output: "3" op_type: "Constant" attribute { name: "value" t { dims: 4 data_type: INT64 - raw_data: "\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000\004\000\000\000\000\000\000\000" + raw_data: "\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000" } type: TENSOR } } node { + input: "0" input: "2" + output: "3" + op_type: "Reshape" + } + node { input: "3" + input: "1" output: "4" op_type: "Tile" } diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index 1e2c401dcc3ac0..ba8292e616686a 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -364,6 +364,10 @@ def test_pow(self): y = Variable(torch.randn(1, 2, 3, 4), requires_grad=True) self.assertONNX(lambda x, y: x.pow(y), (x, y)) + def test_elu(self): + x = Variable(torch.randn(1, 2, 3, 4), requires_grad=True) + self.assertONNX(nn.ELU(), x) + def test_selu(self): x = Variable(torch.randn(1, 2, 3, 4), requires_grad=True) self.assertONNX(nn.SELU(), x) diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 85ef2eac5bf2ce..7130a7695cc69b 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -676,6 +676,52 @@ def forward(self, x): x = Variable(torch.randn(*shape)) self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + def test_cumsum(self): + shape = (3, 4, 5) + for params in [{'dim': i} for i in range(len(shape))]: + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, x): + return torch.cumsum(x, **params) + x = Variable(torch.randn(*shape)) + self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + + def test_repeat(self): + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, x): + return x.repeat(1, 2, 3, 4) + + x = Variable(torch.randn(4, 3, 2, 1), requires_grad=True) + self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + + def test_repeat_dim_overflow(self): + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, x): + return x.repeat(1, 2, 3, 4) + + x = Variable(torch.randn(1, 2), requires_grad=True) + self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + + def test_repeat_dynamic(self): + class MyModel(torch.nn.Module): + def __init__(self): + super(MyModel, self).__init__() + + def forward(self, x, y): + return x.repeat(y.size()[0] / 2, y.size()[1] * 2) + + x = Variable(torch.randn(1, 2), requires_grad=True) + y = Variable(torch.randn(2, 4), requires_grad=True) + 
self.run_model_test(MyModel(), train=False, input=(x, y), batch_size=BATCH_SIZE, use_gpu=False) + def test_mean(self): shape = (3, 4, 5) for params in [{}] + [{'dim': i} for i in range(len(shape))]: diff --git a/test/test_autograd.py b/test/test_autograd.py index 3ef7c21d49fc90..9d39043db9b56d 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -15,7 +15,7 @@ from torch.autograd.function import once_differentiable from torch.autograd.profiler import profile from common import TEST_MKL, TestCase, run_tests, skipIfNoLapack, \ - suppress_warnings, skipIfNoZeroSize, TEST_WITH_ROCM + suppress_warnings, TEST_WITH_ROCM from torch.autograd import Variable, Function, detect_anomaly from torch.autograd.function import InplaceFunction from torch.testing import make_non_contiguous, randn_like @@ -1851,6 +1851,16 @@ def backward(ctx, grad_output): out.sum().backward() self.assertEqual(x.grad.data, y_data) + def test_broadcast_tensors(self): + f_args_variable = (torch.randn(3, requires_grad=True), + torch.randn(1, 2, 1, requires_grad=True), + torch.randn(1, 1, requires_grad=True), + torch.randn(5, 1, 1, requires_grad=True)) + f_args_tensor = deepcopy(unpack_variables(f_args_variable)) + run_functional_checks(self, "test_broadcast_tensors", "broadcast", + lambda a, b, c, d: torch.broadcast_tensors(a, b, c, d), + True, f_args_variable, f_args_tensor) + def test_cat(self): f_args_variable = (torch.randn(1, S, S, requires_grad=True), torch.randn(2, S, S, requires_grad=True), @@ -1892,7 +1902,6 @@ def test_cat_empty_legacy(self): False, f_args_variable, f_args_tensor) self.assertTrue(gradcheck(lambda a, b: torch.cat((a, b)), f_args_variable, eps=1e-6, atol=PRECISION)) - @skipIfNoZeroSize def test_cat_empty(self): f_args_variable = (torch.randn(0, S, requires_grad=True), torch.randn(S, S, requires_grad=True)) @@ -1901,7 +1910,6 @@ def test_cat_empty(self): lambda a, b: torch.cat((a, b)), True, f_args_variable, f_args_tensor) - @skipIfNoLapack def test_potrf(self): root = Variable(torch.tril(torch.rand(S, S)), requires_grad=True) @@ -3123,7 +3131,7 @@ class dont_convert(tuple): ('select', (S, S, S), (1, -1), 'wrap_dim', [0]), ('select', (S,), (0, 2), '1d'), ('narrow', (S, S, S), (1, 2, 2), 'dim', [0]), - ('narrow', (S, S, S), (1, 0, 0), 'empty_dim', [0], [skipIfNoZeroSize]), + ('narrow', (S, S, S), (1, 0, 0), 'empty_dim', [0]), ('squeeze', (S, 1, S, 1), NO_ARGS), ('squeeze', (1, 1, 1, 1), NO_ARGS, 'input_sizes_are_ones'), ('squeeze', (S, 1, S, 1), (1,), '1_dim', [0]), diff --git a/test/test_distributions.py b/test/test_distributions.py index 7effb9012e9fc6..8a607ece6931c5 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -42,8 +42,8 @@ Independent, Laplace, LogisticNormal, LogNormal, LowRankMultivariateNormal, Multinomial, MultivariateNormal, - Normal, OneHotCategorical, Pareto, Poisson, - RelaxedBernoulli, RelaxedOneHotCategorical, + NegativeBinomial, Normal, OneHotCategorical, Pareto, + Poisson, RelaxedBernoulli, RelaxedOneHotCategorical, StudentT, TransformedDistribution, Uniform, Weibull, constraints, kl_divergence) from torch.distributions.constraint_registry import biject_to, transform_to @@ -123,6 +123,16 @@ def is_all_nan(tensor): {'probs': torch.tensor([[1.0, 0.0], [0.0, 1.0]], requires_grad=True), 'total_count': torch.tensor(0.)}, ]), + Example(NegativeBinomial, [ + {'probs': torch.tensor([[0.1, 0.2, 0.3], [0.5, 0.3, 0.2]], requires_grad=True), 'total_count': 10}, + {'probs': torch.tensor([[0.9, 0.0], [0.0, 0.9]], requires_grad=True), 'total_count': 10}, + 
{'probs': torch.tensor([[0.9, 0.0], [0.0, 0.9]], requires_grad=True), 'total_count': torch.tensor([10])}, + {'probs': torch.tensor([[0.9, 0.0], [0.0, 0.9]], requires_grad=True), 'total_count': torch.tensor([10, 8])}, + {'probs': torch.tensor([[0.9, 0.0], [0.0, 0.9]], requires_grad=True), + 'total_count': torch.tensor([[10., 8.], [5., 3.]])}, + {'probs': torch.tensor([[0.9, 0.0], [0.0, 0.9]], requires_grad=True), + 'total_count': torch.tensor(0.)}, + ]), Example(Multinomial, [ {'probs': torch.tensor([[0.1, 0.2, 0.3], [0.5, 0.3, 0.2]], requires_grad=True), 'total_count': 10}, {'probs': torch.tensor([[1.0, 0.0], [0.0, 1.0]], requires_grad=True), 'total_count': 10}, @@ -442,6 +452,12 @@ def is_all_nan(tensor): {'probs': torch.tensor([[1.0, 0.0], [0.0, 2.0]], requires_grad=True), 'total_count': 10}, ]), + Example(NegativeBinomial, [ + {'probs': torch.tensor([[-0.0000001, 0.2, 0.3], [0.5, 0.3, 0.2]], requires_grad=True), + 'total_count': 10}, + {'probs': torch.tensor([[1.0, 0.0], [0.0, 2.0]], requires_grad=True), + 'total_count': 10}, + ]), Example(Cauchy, [ {'loc': 0.0, 'scale': -1.0}, {'loc': torch.tensor([0.0]), 'scale': 0.0}, @@ -911,6 +927,37 @@ def test_binomial_enumerate_support(self): bin1 = Binomial(torch.tensor(5), torch.tensor(0.5)) self.assertEqual(bin1.enumerate_support(), torch.arange(6)) + def test_negative_binomial(self): + p = torch.tensor(torch.arange(0.05, 1, 0.1), requires_grad=True) + for total_count in [1, 2, 10]: + self._gradcheck_log_prob(lambda p: NegativeBinomial(total_count, p), [p]) + self._gradcheck_log_prob(lambda p: NegativeBinomial(total_count, None, p.log()), [p]) + self.assertRaises(NotImplementedError, NegativeBinomial(10, p).rsample) + self.assertRaises(NotImplementedError, NegativeBinomial(10, p).entropy) + + @unittest.skipIf(not TEST_NUMPY, "NumPy not found") + def test_negative_binomial_log_prob(self): + probs = torch.tensor(torch.arange(0.05, 1, 0.1)) + for total_count in [1, 2, 10]: + + def ref_log_prob(idx, x, log_prob): + p = probs.view(-1)[idx].item() + expected = scipy.stats.nbinom(total_count, 1 - p).logpmf(x) + self.assertAlmostEqual(log_prob, expected, places=3) + + self._check_log_prob(NegativeBinomial(total_count, probs), ref_log_prob) + logits = probs_to_logits(probs, is_binary=True) + self._check_log_prob(NegativeBinomial(total_count, logits=logits), ref_log_prob) + + @unittest.skipIf(not TEST_NUMPY, "NumPy not found") + def test_negative_binomial_log_prob_vectorized_count(self): + probs = torch.tensor([0.2, 0.7, 0.9]) + for total_count, sample in [(torch.tensor([10]), torch.tensor([7., 3., 9.])), + (torch.tensor([1, 2, 10]), torch.tensor([0., 1., 9.]))]: + log_prob = NegativeBinomial(total_count, probs).log_prob(sample) + expected = scipy.stats.nbinom(total_count.cpu().numpy(), 1 - probs.cpu().numpy()).logpmf(sample) + self.assertAlmostEqual(log_prob, expected, places=4) + def test_multinomial_1d(self): total_count = 10 p = torch.tensor([0.1, 0.2, 0.3], requires_grad=True) @@ -3475,7 +3522,7 @@ def setUp(self): ), ( Binomial(10, simplex_tensor), - scipy.stats.binom(10 * np.ones(simplex_tensor.shape), simplex_tensor) + scipy.stats.binom(10 * np.ones(simplex_tensor.shape), simplex_tensor.numpy()) ), ( Cauchy(random_var, positive_var), @@ -3862,6 +3909,9 @@ def get_constraints(self, is_cuda=False): constraints.greater_than(0), constraints.greater_than(2), constraints.greater_than(-2), + constraints.greater_than_eq(0), + constraints.greater_than_eq(2), + constraints.greater_than_eq(-2), constraints.less_than(tensor([-10., -2, 0, 2, 10])), 
constraints.less_than(0), constraints.less_than(2), @@ -3871,6 +3921,10 @@ def get_constraints(self, is_cuda=False): tensor([-3., 3, 1, 5, 5])), constraints.interval(-2, -1), constraints.interval(1, 2), + constraints.half_open_interval(tensor([-4., -2, 0, 2, 4]), + tensor([-3., 3, 1, 5, 5])), + constraints.half_open_interval(-2, -1), + constraints.half_open_interval(1, 2), constraints.simplex, constraints.lower_cholesky, ] diff --git a/test/test_indexing.py b/test/test_indexing.py index 00865d9f576b74..afe9e6d60c653c 100644 --- a/test/test_indexing.py +++ b/test/test_indexing.py @@ -1,4 +1,4 @@ -from common import TestCase, run_tests, skipIfNoZeroSize +from common import TestCase, run_tests import torch import warnings from torch import tensor @@ -93,7 +93,6 @@ def test_empty_index(self): y[mask] = -1 self.assertEqual(x, y) - @skipIfNoZeroSize def test_empty_ndim_index(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -104,14 +103,12 @@ def test_empty_ndim_index(self): self.assertEqual(torch.empty(2, 0, 6, 4, 5, device=device), x[:, torch.empty(0, 6, dtype=torch.int64, device=device)]) - @skipIfNoZeroSize def test_empty_ndim_index_bool(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: x = torch.randn(5, device=device) self.assertRaises(IndexError, lambda: x[torch.empty(0, 2, dtype=torch.uint8, device=device)]) - @skipIfNoZeroSize def test_empty_slice(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -475,26 +472,18 @@ def test_boolean_indexing_twodim(self): def test_boolean_indexing_weirdness(self): # Weird boolean indexing things a = torch.ones((2, 3, 4)) - if torch._C._use_zero_size_dim(): - self.assertEqual((0, 2, 3, 4), a[False, True, ...].shape) - else: - self.assertEqual((0,), a[False, True, ...].shape) + self.assertEqual((0, 2, 3, 4), a[False, True, ...].shape) self.assertEqual(torch.ones(1, 2), a[True, [0, 1], True, True, [1], [[2]]]) - if torch._C._use_zero_size_dim(): - self.assertRaises(RuntimeError, lambda: a[False, [0, 1], ...]) + self.assertRaises(RuntimeError, lambda: a[False, [0, 1], ...]) def test_boolean_indexing_weirdness_tensors(self): # Weird boolean indexing things false = torch.tensor(False) true = torch.tensor(True) a = torch.ones((2, 3, 4)) - if torch._C._use_zero_size_dim(): - self.assertEqual((0, 2, 3, 4), a[False, True, ...].shape) - else: - self.assertEqual((0,), a[False, True, ...].shape) + self.assertEqual((0, 2, 3, 4), a[False, True, ...].shape) self.assertEqual(torch.ones(1, 2), a[true, [0, 1], true, true, [1], [[2]]]) - if torch._C._use_zero_size_dim(): - self.assertRaises(RuntimeError, lambda: a[false, [0, 1], ...]) + self.assertRaises(RuntimeError, lambda: a[false, [0, 1], ...]) def test_boolean_indexing_alldims(self): true = torch.tensor(True) diff --git a/test/test_jit.py b/test/test_jit.py index ab4c907e72d19f..b3bbe9892bc7db 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1122,13 +1122,95 @@ def test_fn(ten, mask): ten = torch.rand(3, 3) self.assertEqual(test_fn(ten, mask), traced_test_fn(ten, mask)) + def test_constant_prop_simple(self): + @torch.jit.script + def constant_prop(input_tensor): + a = 2 * 3 + b = a + 2 + return b + input_tensor + + x = torch.tensor(2) + out_ref = constant_prop(x) + self.run_pass('constant_propagation', constant_prop.graph) + out_test = constant_prop(torch.tensor(2)) + self.assertEqual(out_ref, out_test) + 
self.assertExpected(canonical(constant_prop.graph)) + + def test_constant_prop_nested(self): + @torch.jit.script + def constant_prop(a): + b = 2 + 1 + if a < 2: + c = b + 2 + else: + c = b - 2 + return c + + out_ref = constant_prop(torch.tensor(2)) + self.run_pass('constant_propagation', constant_prop.graph) + out_test = constant_prop(torch.tensor(2)) + self.assertEqual(out_ref, out_test) + self.assertExpected(canonical(constant_prop.graph)) + + def test_constant_prop_print(self): + @torch.jit.script + def constant_prop(input_tensor): + a = 2 * 3 + FIXME_zerol() + print(a) + b = a + 2 + return b + input_tensor + + self.run_pass('constant_propagation', constant_prop.graph) + self.assertExpected(canonical(constant_prop.graph)) + + def test_constant_prop_rand(self): + @torch.jit.script + def constant_prop(): + a = torch.randn([3]) + b = a + 2 + return b + + self.run_pass('constant_propagation', constant_prop.graph) + self.assertExpected(canonical(constant_prop.graph)) + + # TODO: implement + @unittest.expectedFailure + def test_constant_prop_if_constant(self): + @torch.jit.script + def constant_prop(): + b = 3 + if True: + b = 1 + if False: + b = 2 + return b + + self.run_pass('constant_propagation', constant_prop.graph) + self.assertExpected(canonical(constant_prop.graph)) + + # TODO: implement + @unittest.expectedFailure + def test_constant_prop_loop_constant(self): + @torch.jit.script + def constant_prop(): + b = 0 + while True: + b = 1 + while False: + b = 2 + return b + + self.run_pass('constant_propagation', constant_prop.graph) + self.assertExpected(canonical(constant_prop.graph)) + class TestBatched(TestCase): # generate random examples and create an batchtensor with them def rand_batch(self, *dims): dims = [dim for dim in dims if dim != ()] - xs = [torch.rand(1, *(random.randint(1, size) if b else size for b, size in dims[1:])) for i in range(dims[0])] - xb = BatchTensor(xs, torch.tensor([b for b, d in dims[1:]])) + xs = [torch.rand(1, *(random.randint(1, size) if b else size for b, size in dims[1:]), + requires_grad=True) for i in range(dims[0])] + xb = BatchTensor(xs, torch.tensor([b for b, d in dims[1:]]).byte()) return xs, xb def test_create_batchtensor(self): @@ -1156,20 +1238,20 @@ def tanh(a): def test_batch_elementwise_binary(self): @torch.jit.batch(batch_size=4) - def mul(a, b): - return a * b + def add(a, b): + return a + b xs, batch = self.rand_batch(4, (True, 3), (False, 2)) xs2, batch2 = xs, batch - res_batch = mul(batch, batch2) - res = [torch.mul(xs[j], xs2[j]) for j in range(4)] + res_batch = add(batch, batch2) + res = [torch.add(xs[j], xs2[j]) for j in range(4)] self.assertEqual(res, res_batch.examples()) # test broadcast xs, batch = self.rand_batch(4, (False, 3), (False, 2)) b = torch.rand(3, 2) - res_batch = mul(batch, b) - res = [torch.mul(xs[j], b) for j in range(4)] + res_batch = add(batch, b) + res = [torch.add(xs[j], b) for j in range(4)] self.assertEqual(res, res_batch.examples()) def test_batch_mm(self): @@ -1216,6 +1298,33 @@ def matmul_test(xs, batch, xs2, batch2): xs2, batch2 = self.rand_batch(4, (False, 2), (True, 3)) matmul_test(xs, batch, xs2, batch2) + def test_batch_select(self): + @torch.jit.batch(batch_size=4) + def select(x): + return torch.select(x, 1, 0) + + xs, batch = self.rand_batch(4, (True, 3), (True, 2)) + res_batch = select(batch) + res = [torch.select(xs[j], 1, 0) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + xs, batch = self.rand_batch(4, (False, 3), (True, 2)) + res_batch = select(batch) + res = 
[torch.select(xs[j], 1, 0) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + def test_batch_index_select(self): + @torch.jit.batch(batch_size=4) + def index_select(x, ind): + return x.index_select(1, ind) + + xs, batch = self.rand_batch(4, (False, 5), (True, 2)) + ind = [torch.randint(0, 4, (1,), dtype=torch.long) for i in range(4)] + ind_batch = BatchTensor(ind, torch.tensor([]).byte()) + res_batch = index_select(batch, ind_batch) + res = [torch.index_select(xs[j], 1, ind[j]) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + def test_batch_where(self): @torch.jit.batch(batch_size=4) def where(c, a, b): @@ -1232,43 +1341,300 @@ def where(c, a, b): res = [torch.where(xs_cond[j], xs[j], xs2[j]) for j in range(4)] self.assertEqual(res, res_batch.examples()) - @unittest.skip("Need support for scalar arguments") - def test_lstm_cell(self): - def LSTMCell(x, h, c, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c): - i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i - f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) + b_f - o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o - # activations - i_t = torch.sigmoid(i_t) - f_t = torch.sigmoid(f_t) - o_t = torch.sigmoid(o_t) - # cell computations - c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c - c_t = torch.tanh(c_t) - c_t = torch.mul(c, f_t) + torch.mul(i_t, c_t) - h_t = torch.mul(o_t, torch.tanh(c_t)) - return h_t + def test_batch_argmax(self): + @torch.jit.batch(batch_size=4) + def argmax(a): + return torch.argmax(a, 1) + + xs, batch = self.rand_batch(4, (True, 5), (True, 6)) + res_batch = argmax(batch) + res = [torch.argmax(xs[j], 1) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + @torch.jit.batch(batch_size=4) + def argmax(a): + return torch.argmax(a, 1, False) + + res_batch = argmax(batch) + res = [torch.argmax(xs[j], 1, False) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + def test_batch_topk(self): + @torch.jit.batch(batch_size=4) + def topk(a): + return torch.topk(a, 3, 1) + + xs, batch = self.rand_batch(4, (False, 5), (True, 6)) + + # along static dim + res_batch = topk(batch) + res = [torch.topk(xs[j], 3, 1)[0] for j in range(4)] + res_idx = [torch.topk(xs[j], 3, 1)[1] for j in range(4)] + self.assertEqual(res, res_batch[0].examples()) + self.assertEqual(res_idx, res_batch[1].examples()) + + @torch.jit.batch(batch_size=4) + def topk(a): + return torch.topk(a, 1, 2) + + # along dynamic dim + res_batch = topk(batch) + res = [torch.topk(xs[j], 1, 2)[0] for j in range(4)] + res_idx = [torch.topk(xs[j], 1, 2)[1] for j in range(4)] + self.assertEqual(res, res_batch[0].examples()) + self.assertEqual(res_idx, res_batch[1].examples()) + + def test_batch_softmax(self): + @torch.jit.batch(batch_size=4) + def softmax(a): + return torch.softmax(a, 1) + + xs, batch = self.rand_batch(4, (False, 5), (True, 6)) + + # along static dim + res_batch = softmax(batch) + res = [torch.softmax(xs[j], 1) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + @torch.jit.batch(batch_size=4) + def softmax(a): + return torch.softmax(a, 2) + + # along dynamic dim + res_batch = softmax(batch) + res = [torch.softmax(xs[j], 2) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + def test_batch_view(self): + @torch.jit.batch(batch_size=4) + def view(a): + return a.view([4, -1, 3]) + + xs, batch = self.rand_batch(4, (True, 5), (False, 3)) + res_batch = view(batch) + res = [xs[j].view([1, -1, 3]) for j in range(4)] + 
self.assertEqual(res, res_batch.examples()) + + def test_batch_cat(self): + @torch.jit.batch(batch_size=4) + def cat2(a, b): + return torch.cat([a, b], 2) + + xs, batch = self.rand_batch(4, (True, 5), (False, 3)) + xs2, batch2 = xs, batch + res_batch = cat2(batch, batch2) + res = [torch.cat([xs[j], xs2[j]], 2) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + def test_batch_sum(self): @torch.jit.batch(batch_size=4) - def LSTMCell_batch(x, h, c, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c): - i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i - f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) + b_f - o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o - # activations - i_t = torch.sigmoid(i_t) - f_t = torch.sigmoid(f_t) - o_t = torch.sigmoid(o_t) - # cell computations - c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c - c_t = torch.tanh(c_t) - c_t = torch.mul(c, f_t) + torch.mul(i_t, c_t) - h_t = torch.mul(o_t, torch.tanh(c_t)) - return h_t + def batch_sum(a): + return a.sum() + + xs, batch = self.rand_batch(4, (True, 5), (False, 3)) + res_batch = batch_sum(batch) + res = [xs[j].sum().unsqueeze(0) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + def test_if_else(self): + def single_if(a, b): + if a > b: + a = a + b + else: + a = a - b + return a + + batch_if = torch.jit.batch(batch_size=4)(single_if) + + a, batch_a = self.rand_batch(4, ()) + b, batch_b = self.rand_batch(4, ()) + res_batch = batch_if(batch_a, batch_b) + res = [single_if(a[j], b[j]) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + script_if = torch.jit.script(single_if) + graph = torch.to_batch_graph(script_if.graph) + self.assertExpected(str(graph)) + + def test_if_else_with_scalar(self): + def single_if(a, b): + if a > 0.1: + a = a + b + else: + a = a - b + return a + + batch_if = torch.jit.batch(batch_size=4)(single_if) + + a, batch_a = self.rand_batch(4, ()) + b, batch_b = self.rand_batch(4, ()) + res_batch = batch_if(batch_a, batch_b) + res = [single_if(a[j], b[j]) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + script_if = torch.jit.script(single_if) + graph = torch.to_batch_graph(script_if.graph) + self.assertExpected(str(graph)) + + def test_if_noelse(self): + def single_if(a, b): + if a > b: + a = a + b + return a + + batch_if = torch.jit.batch(batch_size=4)(single_if) + + a, batch_a = self.rand_batch(4, ()) + b, batch_b = self.rand_batch(4, ()) + res_batch = batch_if(batch_a, batch_b) + res = [single_if(a[j], b[j]) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + script_if = torch.jit.script(single_if) + graph = torch.to_batch_graph(script_if.graph) + self.assertExpected(str(graph)) + + def test_if_noelse_with_scalar(self): + def single_if(a, b): + if a > 0.1: + a = a + b + return a + + batch_if = torch.jit.batch(batch_size=4)(single_if) + + a, batch_a = self.rand_batch(4, ()) + b, batch_b = self.rand_batch(4, ()) + res_batch = batch_if(batch_a, batch_b) + res = [single_if(a[j], b[j]) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + script_if = torch.jit.script(single_if) + graph = torch.to_batch_graph(script_if.graph) + self.assertExpected(str(graph)) + + def test_while(self): + def single_while(a, b): + while a > b: + a = a - b + return a + + batch_while = torch.jit.batch(batch_size=4)(single_while) + + a, batch_a = self.rand_batch(4, ()) + b = [torch.abs(torch.rand(1)) for i in range(4)] + batch_b = BatchTensor(b, torch.tensor([]).byte()) + res_batch 
= batch_while(batch_a, batch_b) + res = [single_while(a[j], b[j]) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + script_while = torch.jit.script(single_while) + graph = torch.to_batch_graph(script_while.graph) + self.assertExpected(str(graph)) + + def test_for(self): + def single_for(x, y): + for _ in range(10): + x = x + y + return x + + batch_for = torch.jit.batch(batch_size=4)(single_for) + + a, batch_a = self.rand_batch(4, ()) + b, batch_b = self.rand_batch(4, ()) + res_batch = batch_for(batch_a, batch_b) + res = [single_for(a[j], b[j]) for j in range(4)] + self.assertEqual(res, res_batch.examples()) + + script_for = torch.jit.script(single_for) + graph = torch.to_batch_graph(script_for.graph) + self.assertExpected(str(graph)) + + def test_lstm(self): + def LSTM(x_all, h, c, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c): + for i in range(x_all.size(1)): + x = x_all.select(1, i) + i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i + f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) + b_f + o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o + # activations + i_t = torch.sigmoid(i_t) + f_t = torch.sigmoid(f_t) + o_t = torch.sigmoid(o_t) + # cell computations + c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c + c_t = torch.tanh(c_t) + c_t = torch.mul(c_t, f_t) + torch.mul(i_t, c_t) + h_t = torch.mul(o_t, torch.tanh(c_t)) + h = h_t + c = c_t + return h + + LSTM_batch = torch.jit.batch(batch_size=4)(LSTM) batch_size, input_size, hidden_size = 4, 3, 2 + xs, batch = self.rand_batch(batch_size, (True, 4), (False, input_size)) + hx, h_batch = self.rand_batch(batch_size, (False, hidden_size)) + cx, c_batch = self.rand_batch(batch_size, (False, hidden_size)) + + # input to hidden weights + w_xi = torch.rand(input_size, hidden_size) + w_xf = torch.rand(input_size, hidden_size) + w_xo = torch.rand(input_size, hidden_size) + w_xc = torch.rand(input_size, hidden_size) + # hidden to hidden weights + w_hi = torch.rand(hidden_size, hidden_size) + w_hf = torch.rand(hidden_size, hidden_size) + w_ho = torch.rand(hidden_size, hidden_size) + w_hc = torch.rand(hidden_size, hidden_size) + # bias terms + b_i = torch.rand(hidden_size) + b_f = torch.rand(hidden_size) + b_o = torch.rand(hidden_size) + b_c = torch.rand(hidden_size) + + ys = [LSTM(xs[j], hx[j], cx[j], w_xi, w_xf, w_xo, w_xc, + w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c) for j in range(batch_size)] + ybs = LSTM_batch(batch, h_batch, c_batch, w_xi, w_xf, w_xo, w_xc, + w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c) + self.assertEqual(ys, ybs.examples()) + + def test_greedy_search(self): + def greedy(x, h, c, embed, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, + b_i, b_f, b_o, b_c, w_hs, b_s, iter_num): + iter_count = torch.zeros_like(iter_num) + while(iter_count < iter_num): + iter_count += 1 + # LSTM Cell + i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i + f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) + b_f + o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o + # activations + i_t = torch.sigmoid(i_t) + f_t = torch.sigmoid(f_t) + o_t = torch.sigmoid(o_t) + # cell computations + c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c + c_t = torch.tanh(c_t) + c_t = torch.mul(c_t, f_t) + torch.mul(i_t, c_t) + h_t = torch.mul(o_t, torch.tanh(c_t)) + h = h_t + c = c_t + # calculate feature with max probability + s_t = torch.matmul(h_t, w_hs) + b_s + p_t = torch.softmax(s_t, 1) + i_t = torch.argmax(p_t, 1) + x = embed.index_select(1, i_t).squeeze(1) + return h + + 
greedy_batch = torch.jit.batch(batch_size=4)(greedy) + + batch_size, input_size, hidden_size, vocab_size = 4, 6, 8, 7 xs, batch = self.rand_batch(batch_size, (False, input_size)) hx, h_batch = self.rand_batch(batch_size, (False, hidden_size)) cx, c_batch = self.rand_batch(batch_size, (False, hidden_size)) + embed, embed_batch = self.rand_batch(batch_size, (False, vocab_size), (False, input_size)) + iter_num = [torch.randint(2, 5, (1,)) for i in range(batch_size)] + iter_num_batch = BatchTensor(iter_num, torch.tensor([]).byte()) # input to hidden weights w_xi = torch.rand(input_size, hidden_size) @@ -1285,11 +1651,102 @@ def LSTMCell_batch(x, h, c, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_i, b_f = torch.rand(hidden_size) b_o = torch.rand(hidden_size) b_c = torch.rand(hidden_size) + # hidden to vocab weights, bias + w_hs = torch.rand(hidden_size, vocab_size) + b_s = torch.rand(vocab_size) + + ys = [greedy(xs[j], hx[j], cx[j], embed[j], w_xi, w_xf, w_xo, w_xc, + w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c, w_hs, b_s, iter_num[j]) for j in range(batch_size)] + ybs = greedy_batch(batch, h_batch, c_batch, embed_batch, w_xi, w_xf, w_xo, w_xc, + w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c, w_hs, b_s, iter_num_batch) + self.assertEqual(ys, ybs.examples()) - ys = [LSTMCell(xs[j].squeeze(0), hx[j], cx[j], w_xi, w_xf, w_xo, w_xc, - w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c) for j in range(batch_size)] - ybs = LSTMCell_batch(batch, h_batch, c_batch, w_xi, w_xf, w_xo, w_xc, - w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c) + def test_beam_search(self): + def beam(x, h, c, embed, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, + b_i, b_f, b_o, b_c, w_hs, b_s, iter_num, idx): + k = 5 + vocab_size = embed.size(1) + iter_count = torch.zeros_like(iter_num) + max_len = idx.size(2) + while(iter_count < iter_num): + iter_count += 1 + # LSTM Cell + i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i + f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) + b_f + o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o + # activations + i_t = torch.sigmoid(i_t) + f_t = torch.sigmoid(f_t) + o_t = torch.sigmoid(o_t) + # cell computations + c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c + c_t = torch.tanh(c_t) + c_t = torch.mul(c_t, f_t) + torch.mul(i_t, c_t) + h_t = torch.mul(o_t, torch.tanh(c_t)) + h = h_t + c = c_t + # calculate features with max probability + s_t = torch.matmul(h_t, w_hs) + b_s + s_t = s_t.view([1, s_t.size(1) * s_t.size(2)]) + p_t = torch.softmax(s_t, 1) + prob_t, idx_t = torch.topk(p_t, k, 1) + if(int(idx_t.dim()) > 1): + idx_t_tmp = idx_t.squeeze(0) + else: + idx_t_tmp = idx_t + new_y = torch.fmod(idx_t_tmp, vocab_size) + pre_y = idx_t_tmp / vocab_size + x = embed.index_select(1, new_y) + h = h_t.index_select(1, pre_y) + c = c_t.index_select(1, pre_y) + iter = int(iter_count[0]) + idx = torch.cat([idx.narrow(2, 0, iter).index_select(1, pre_y), + torch.fmod(idx_t, vocab_size).unsqueeze(-1), + idx.narrow(2, iter, max_len - iter)], 2) + idx = idx.narrow(2, 0, max_len) + return idx + + beam_batch = torch.jit.batch(batch_size=4)(beam) + + k = 5 + batch_size, input_size, hidden_size, vocab_size = 4, 6, 8, 7 + max_len = 5 + xs, batch = self.rand_batch(batch_size, (False, 1), (False, input_size)) + hx, h_batch = self.rand_batch(batch_size, (False, 1), (False, hidden_size)) + cx, c_batch = self.rand_batch(batch_size, (False, 1), (False, hidden_size)) + embed, embed_batch = self.rand_batch(batch_size, (False, vocab_size), (False, input_size)) + iter_num = [torch.randint(2, max_len + 
1, (1,)) for i in range(batch_size)] + iter_num_batch = BatchTensor(iter_num, torch.tensor([]).byte()) + + # input to hidden weights + w_xi = torch.rand(input_size, hidden_size) + w_xf = torch.rand(input_size, hidden_size) + w_xo = torch.rand(input_size, hidden_size) + w_xc = torch.rand(input_size, hidden_size) + # hidden to hidden weights + w_hi = torch.rand(hidden_size, hidden_size) + w_hf = torch.rand(hidden_size, hidden_size) + w_ho = torch.rand(hidden_size, hidden_size) + w_hc = torch.rand(hidden_size, hidden_size) + # bias terms + b_i = torch.rand(1, hidden_size) + b_f = torch.rand(1, hidden_size) + b_o = torch.rand(1, hidden_size) + b_c = torch.rand(1, hidden_size) + # hidden to vocab weights, bias + w_hs = torch.rand(hidden_size, vocab_size) + b_s = torch.rand(1, vocab_size) + + idx_batch = torch.jit.BatchTensor(torch.zeros([batch_size, k, max_len], dtype=torch.long), + torch.zeros([batch_size, 1, max_len]).byte(), + torch.tensor([0, 1]).byte()) + idx = [torch.zeros([1, k, max_len], dtype=torch.long) for _ in range(batch_size)] + + ys = [beam(xs[j], hx[j], cx[j], embed[j], w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, + b_i, b_f, b_o, b_c, w_hs, b_s, iter_num[j], idx[j]).narrow(2, 0, int(iter_num[j])) + for j in range(batch_size)] + ybs = beam_batch(batch, h_batch, c_batch, embed_batch, w_xi, w_xf, w_xo, w_xc, + w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c, w_hs, b_s, iter_num_batch, idx_batch) self.assertEqual(ys, ybs.examples()) @@ -3648,10 +4105,10 @@ def test_unknown_builtin(self): def unknown_builtin(x): return x.splork(3) - def test_expected_tensor_found_tuple(self): - with self.assertRaisesRegex(RuntimeError, 'expected a tensor value but found'): + def test_return_tuple(self): + with self.assertRaisesRegex(RuntimeError, 'only supported return types'): @torch.jit.script - def return_tuple_wrong(x): + def return_tuple(x): a = (x, x) return a, x @@ -4370,6 +4827,17 @@ def tuple_arg(x): # type: (Tuple[Tensor, Tensor]) -> Tensor return x + 1 + def test_script_non_tensor_args_outputs(self): + @torch.jit.script + def fn(x, y): + # type: (Tensor, float) -> float + return float((x + y).sum()) + + x = torch.ones(2, 2) + z = fn(x, 1) + self.assertIsInstance(z, float) + self.assertEqual(z, 8.) 
+ @unittest.skip('https://github.com/pytorch/pytorch/issues/9595') def test_inline_and_run_annotated_script_fn(self): @torch.jit.script @@ -4912,11 +5380,9 @@ def forward(self, x, y): 'test_expand_new_dim', 'test_expand_new_dim_front_old_front_1', 'test_expand_scalar_to_dims', - 'test_expand_scalar_to_scalar', 'test_expand_size', 'test_permute', 'test_permute_neg_dim', - 'test_permute_scalar', 'test_repeat', 'test_repeat_scalar', 'test_repeat_single_number', @@ -4924,12 +5390,10 @@ def forward(self, x, y): 'test_reshape', 'test_reshape_1d', 'test_reshape_scalar_to_1d', - 'test_reshape_scalar_to_scalar', 'test_reshape_size', 'test_view', 'test_view_1d', 'test_view_scalar_to_1d', - 'test_view_scalar_to_scalar', 'test_view_size', 'test_split_dim', 'test_split_dim_neg0', diff --git a/test/test_legacy_nn.py b/test/test_legacy_nn.py index 1463d15cf22d0c..de65e6fc8ce7a0 100644 --- a/test/test_legacy_nn.py +++ b/test/test_legacy_nn.py @@ -693,14 +693,18 @@ def _backward(self, module, input, output, grad_output, create_graph=False): return module.backward(input, grad_output) - def _forward_criterion(self, criterion, input, target): + def _forward_criterion(self, criterion, input, target, extra_args=None): + if extra_args is None: + extra_args = tuple() with torch.no_grad(): - return criterion.forward(input, target) + return criterion.forward(input, target, *extra_args) - def _backward_criterion(self, criterion, input, target, gradOutput=None): + def _backward_criterion(self, criterion, input, target, gradOutput=None, extra_args=None): + if extra_args is None: + extra_args = tuple() # Ignore gradOutput. It's used for non-legacy tests. with torch.no_grad(): - return criterion.backward(input, target) + return criterion.backward(input, target, *extra_args) def _zero_grad_parameters(self, module): return module.zeroGradParameters() diff --git a/test/test_nn.py b/test/test_nn.py index ccd698747ae8d5..8682463cf9bc6c 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -36,7 +36,7 @@ TEST_CUDNN_VERSION from common_nn import NNTestCase, ModuleTest, CriterionTest, TestBase, \ module_tests, criterion_tests, loss_reference_fns, get_reduction, \ - get_weight, smoothl1loss_reference, kldivloss_reference + get_weight, smoothl1loss_reference, kldivloss_reference, ctcloss_reference if TEST_SCIPY: @@ -383,6 +383,8 @@ class NewCriterionTest(InputVariableMixin, CriterionTest): def __init__(self, *args, **kwargs): super(NewCriterionTest, self).__init__(*args, **kwargs) self.check_gradgrad = kwargs.get('check_gradgrad', True) + self.check_half = kwargs.get('check_half', True) + self.convert_target = kwargs.get('convert_target', True) def _do_extra_tests(self, test_case, module, input, target): if not self.check_gradgrad: @@ -407,7 +409,7 @@ def apply_fn(input1, input2, *params): gradcheck(apply_fn, inputs) gradgradcheck(apply_fn, inputs) - def test_cuda(self, test_case, dtype=None): + def test_cuda(self, test_case, dtype=None, extra_args=None): def convert_dtype(obj, dtype, requires_grad=False): if isinstance(obj, torch.Tensor): return torch.tensor(obj.data, dtype=dtype, requires_grad=requires_grad) @@ -430,7 +432,7 @@ def convert_dtype(obj, dtype, requires_grad=False): if dtype is not None: cpu_input = convert_dtype(cpu_input, dtype, True) # NLLLoss requires target to be LongTensor - if not isinstance(cpu_target, torch.LongTensor): + if not isinstance(cpu_target, torch.LongTensor) and self.convert_target: cpu_target = convert_dtype(cpu_target, dtype) cpu_module.type(dtype) gpu_module.type(dtype) @@ -447,13 +449,13 @@ 
def convert_dtype(obj, dtype, requires_grad=False): # Loss modules with weights require consistent input/module weight types cpu_module = self.constructor(*self.constructor_args) - cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target) - gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target) + cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target, extra_args=extra_args) + gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target, extra_args=extra_args) # dtype can be None, so set precision in this way instead of a precision map test_case.assertEqual(cpu_output, gpu_output, 1e-1 if dtype == torch.half else 4e-4) - cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_target) - gpu_gradInput = test_case._backward_criterion(gpu_module, gpu_input, gpu_target) + cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_target, extra_args=extra_args) + gpu_gradInput = test_case._backward_criterion(gpu_module, gpu_input, gpu_target, extra_args=extra_args) test_case.assertEqual(cpu_gradInput, gpu_gradInput, 1e-1 if dtype == torch.half else 4e-4) except NotImplementedError: pass @@ -465,6 +467,10 @@ def _get_target(self): def constructor_args(self): return self._get_arg('constructor_args', False) + @property + def extra_args(self): + return self._get_arg('extra_args', False) + class TestNN(NNTestCase): _do_cuda_memory_leak_check = True @@ -479,20 +485,24 @@ def _backward(self, module, input, output, grad_output, create_graph=False): return None return input.grad.data - def _forward_criterion(self, criterion, input, target): + def _forward_criterion(self, criterion, input, target, extra_args=None): + if extra_args is None: + extra_args = tuple() if isinstance(input, tuple): - args = input + (target,) + args = input + (target,) + extra_args output = criterion(*args) else: - output = criterion(input, target) + output = criterion(input, target, *extra_args) return output.item() - def _backward_criterion(self, criterion, input, target, gradOutput=None): + def _backward_criterion(self, criterion, input, target, gradOutput=None, extra_args=None): + if extra_args is None: + extra_args = tuple() input_tuple = input if isinstance(input, tuple) else (input,) for i in input_tuple: if i.grad is not None: i.grad.data.zero_() - args = input_tuple + (target,) + args = input_tuple + (target,) + extra_args if gradOutput is None: gradOutput = torch.ones(()) criterion(*args).backward(gradOutput.type_as(input_tuple[0])) @@ -1585,6 +1595,7 @@ def test(nonlinearity, *args, **kwargs): test('relu6') test('elu') test('selu') + test('celu') test('rrelu') test('rrelu', inplace=True) test('hardtanh') @@ -3578,6 +3589,19 @@ def test_NLLLoss_mismatched_batch(self): with self.assertRaisesRegex(ValueError, 'Expected.*batch_size'): F.nll_loss(x, t) + @unittest.skipIf(not (TEST_CUDNN and TEST_CUDNN_VERSION >= 7000), "needs cudnn >= 7.0") + def test_CTCLoss_cudnn(self): + target_lengths = [30, 25, 20] + input_lengths = [50, 50, 50] + targets = torch.randint(1, 15, (sum(target_lengths),), dtype=torch.int) + log_probs = torch.randn(50, 3, 15, dtype=torch.float, device='cuda').log_softmax(2) + res = torch.nn.functional.ctc_loss(log_probs, targets, input_lengths, target_lengths) + expected = ctcloss_reference(log_probs, targets.cuda(), input_lengths, target_lengths).float() + with torch.backends.cudnn.flags(enabled=False): + res2 = torch.nn.functional.ctc_loss(log_probs, targets.cuda().long(), input_lengths, 
target_lengths) + self.assertEqual(res, expected) + self.assertEqual(res2, res) + def test_RNN_cell_no_broadcasting(self): def test(cell_module, input, hx, input_size, hidden_size): cell = cell_module(input_size, hidden_size) @@ -4351,7 +4375,7 @@ def _verify_pixel_shuffle(self, input, output, upscale_factor): self.assertEqual(output[:, c, h, w], input[:, channel_idx, height_idx, weight_idx]) def test_inplace_thnn(self): - modules = [nn.ReLU, nn.ELU, nn.SELU, nn.RReLU] + modules = [nn.ReLU, nn.ELU, nn.SELU, nn.CELU, nn.RReLU] for mod in modules: r = mod(inplace=True) input = torch.randn(5, 5, requires_grad=True) @@ -4812,6 +4836,12 @@ def test_triplet_margin_loss_swap_no_reduce(self): self.assertEqual(F.triplet_margin_loss(input1, input2, input3, swap=True, reduction='none'), loss_reference_fns['TripletMarginLoss'](input1, input2, input3, swap=True, reduction='none')) + def test_pointwise_loss_target_grad_none_reduction(self): + i = torch.randn(5, 10) + t = torch.randn(5, 10, requires_grad=True) + self.assertEqual(F.mse_loss(i, t, reduction='none').size(), t.size()) + self.assertEqual(F.l1_loss(i, t, reduction='none').size(), t.size()) + def test_cosine_similarity(self): input1 = torch.randn(4, 4, requires_grad=True) input2 = torch.randn(4, 4, requires_grad=True) @@ -4842,30 +4872,30 @@ def test_grid_sample(self): def test_cpu_against_cuda(N, C, H, W, padding_mode): def test_shape(N, C, IH, IW, H, W, padding_mode): - input_cpu = Variable(torch.randn(C, N, IH, IW).transpose(0, 1), requires_grad=True) - grid_cpu = Variable(torch.randn(H, N, W, 2).transpose(0, 1), requires_grad=True) + input_cpu = torch.randn(C, N, IH, IW).transpose(0, 1).requires_grad_() + grid_cpu = torch.randn(H, N, W, 2).transpose(0, 1).requires_grad_() out_cpu = F.grid_sample(input_cpu, grid_cpu, padding_mode=padding_mode) self.assertTrue(out_cpu.size() == torch.Size([N, C, H, W])) - input_cuda = Variable(input_cpu.data.transpose(0, 1).cuda().transpose(0, 1), requires_grad=True) - grid_cuda = Variable(grid_cpu.data.transpose(0, 1).cuda().transpose(0, 1), requires_grad=True) + input_cuda = input_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() + grid_cuda = grid_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() out_cuda = F.grid_sample(input_cuda, grid_cuda, padding_mode=padding_mode) self.assertEqual(out_cpu, out_cuda) - gradients = out_cpu.data.new(out_cpu.size()).normal_() + gradients = torch.randn_like(out_cpu) out_cpu.backward(gradients) out_cuda.backward(gradients.cuda()) self.assertEqual(input_cpu.grad, input_cuda.grad) self.assertEqual(grid_cpu.grad, grid_cuda.grad, prec=5e-5) # check that zero-dimensional input strides don't error out - base_input = torch.randn(C, IH, IW) - input_cpu = Variable(base_input.expand(input_cuda.size()), requires_grad=True) + base_input = torch.randn(N, C, 1, IW) + input_cpu = base_input.expand_as(input_cuda).requires_grad_() grid_cpu = torch.randn(N, H, W, 2, requires_grad=True) out_cpu = F.grid_sample(input_cpu, grid_cpu, padding_mode=padding_mode) - input_cuda = Variable(base_input.cuda().expand(input_cuda.size()), requires_grad=True) - grid_cuda = Variable(grid_cpu.data.cuda(), requires_grad=True) + input_cuda = base_input.cuda().expand_as(input_cuda).requires_grad_() + grid_cuda = grid_cpu.detach().cuda().requires_grad_() out_cuda = F.grid_sample(input_cuda, grid_cuda, padding_mode=padding_mode) self.assertEqual(out_cpu, out_cuda) @@ -4873,21 +4903,21 @@ def test_shape(N, C, IH, IW, H, W, padding_mode): test_shape(N, C, H, W, H, W, 
padding_mode) # test larger output - N = random.randint(1, 8) - C = random.randint(1, 8) - IH = random.randint(1, 8) - IW = random.randint(1, 8) + N = random.randint(2, 8) + C = random.randint(2, 8) + IH = random.randint(2, 8) + IW = random.randint(2, 8) H = random.randint(IH + 1, 12) W = random.randint(IW + 1, 12) test_shape(N, C, IH, IW, H, W, padding_mode) # test smaller output - N = random.randint(1, 8) - C = random.randint(1, 8) - IH = random.randint(1, 8) - IW = random.randint(1, 8) - H = random.randint(1, IH) - W = random.randint(1, IW) + N = random.randint(2, 8) + C = random.randint(2, 8) + IH = random.randint(2, 8) + IW = random.randint(2, 8) + H = random.randint(2, IH) + W = random.randint(2, IW) test_shape(N, C, IH, IW, H, W, padding_mode) # test known input on CPU @@ -4926,42 +4956,38 @@ def test_shape(N, C, IH, IW, H, W, padding_mode): # test CUDA against CPU if TEST_CUDA: test_cpu_against_cuda(N, C, H, W, padding_mode) - - # test channels >1024, which doesn't work on cudnn 7102 and further - N, C, H, W = 1, 1025, 3, 3 - self.assertTrue(gradcheck( - lambda inp, grid: F.grid_sample(inp, grid, padding_mode=padding_mode), - (input, grid))) - test_cpu_against_cuda(N, C, H, W, padding_mode) + if TEST_CUDNN: + with cudnn.flags(enabled=False): + test_cpu_against_cuda(N, C, H, W, padding_mode) def test_grid_sample_3d(self): def test_cpu_against_cuda(N, C, D, H, W, padding_mode): def test_shape(N, C, ID, IH, IW, D, H, W, padding_mode): - input_cpu = Variable(torch.randn(C, N, ID, IH, IW).transpose(0, 1), requires_grad=True) - grid_cpu = Variable(torch.randn(D, N, H, W, 3).transpose(0, 1), requires_grad=True) + input_cpu = torch.randn(C, N, ID, IH, IW).transpose(0, 1).requires_grad_() + grid_cpu = torch.randn(D, N, H, W, 3).transpose(0, 1).requires_grad_() out_cpu = F.grid_sample(input_cpu, grid_cpu, padding_mode=padding_mode) self.assertTrue(out_cpu.size() == torch.Size([N, C, D, H, W])) - input_cuda = Variable(input_cpu.data.transpose(0, 1).cuda().transpose(0, 1), requires_grad=True) - grid_cuda = Variable(grid_cpu.data.transpose(0, 1).cuda().transpose(0, 1), requires_grad=True) + input_cuda = input_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() + grid_cuda = grid_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() out_cuda = F.grid_sample(input_cuda, grid_cuda, padding_mode=padding_mode) self.assertEqual(out_cpu, out_cuda) - gradients = out_cpu.data.new(out_cpu.size()).normal_() + gradients = torch.randn_like(out_cpu) out_cpu.backward(gradients) out_cuda.backward(gradients.cuda()) self.assertEqual(input_cpu.grad, input_cuda.grad) self.assertEqual(grid_cpu.grad, grid_cuda.grad, prec=5e-5) # check that zero-dimensional input strides don't error out - base_input = torch.randn(C, ID, IH, IW) - input_cpu = Variable(base_input.expand(input_cuda.size()), requires_grad=True) + base_input = torch.randn(N, C, 1, IH, IW) + input_cpu = base_input.expand_as(input_cuda).requires_grad_() grid_cpu = torch.randn(N, D, H, W, 3, requires_grad=True) out_cpu = F.grid_sample(input_cpu, grid_cpu, padding_mode=padding_mode) - input_cuda = Variable(base_input.cuda().expand(input_cuda.size()), requires_grad=True) - grid_cuda = Variable(grid_cpu.data.cuda(), requires_grad=True) + input_cuda = base_input.cuda().expand_as(input_cuda).requires_grad_() + grid_cuda = grid_cpu.detach().cuda().requires_grad_() out_cuda = F.grid_sample(input_cuda, grid_cuda, padding_mode=padding_mode) self.assertEqual(out_cpu, out_cuda) @@ -4969,35 +4995,35 @@ def test_shape(N, C, ID, IH, IW, D, 
H, W, padding_mode): test_shape(N, C, D, H, W, D, H, W, padding_mode) # test larger output - N = random.randint(1, 8) - C = random.randint(1, 8) - ID = random.randint(1, 8) - IH = random.randint(1, 8) - IW = random.randint(1, 8) + N = random.randint(2, 8) + C = random.randint(2, 8) + ID = random.randint(2, 8) + IH = random.randint(2, 8) + IW = random.randint(2, 8) D = random.randint(ID + 1, 12) H = random.randint(IH + 1, 12) W = random.randint(IW + 1, 12) test_shape(N, C, ID, IH, IW, D, H, W, padding_mode) # test smaller output - N = random.randint(1, 8) - C = random.randint(1, 8) - ID = random.randint(1, 8) - IH = random.randint(1, 8) - IW = random.randint(1, 8) - D = random.randint(1, ID) - H = random.randint(1, IH) - W = random.randint(1, IW) + N = random.randint(2, 8) + C = random.randint(2, 8) + ID = random.randint(2, 8) + IH = random.randint(2, 8) + IW = random.randint(2, 8) + D = random.randint(2, ID) + H = random.randint(2, IH) + W = random.randint(2, IW) test_shape(N, C, ID, IH, IW, D, H, W, padding_mode) # test known input on CPU for padding_mode in ['zeros', 'border']: # do gradcheck - N = random.randint(1, 8) - C = random.randint(1, 8) - D = random.randint(1, 8) - H = random.randint(1, 8) - W = random.randint(1, 8) + N = random.randint(2, 8) + C = random.randint(2, 8) + D = random.randint(2, 8) + H = random.randint(2, 8) + W = random.randint(2, 8) input = torch.randn(N, C, D, H, W, requires_grad=True) grid = torch.randn(N, D, H, W, 3, requires_grad=True) self.assertTrue(gradcheck( @@ -5540,6 +5566,11 @@ def test_unfold_invalid_arg(self): unfold = nn.Unfold(kernel_size=(1, 3), padding=(1, 1), dilation=(1, 2)) unfold(torch.randn(1, 2, 2, 2)) + def test_softmin(self): + x = torch.randn(2, 16) + self.assertEqual(F.softmin(x, 1), F.softmax(-x, 1)) + self.assertEqual(F.softmin(x, 0), F.softmax(-x, 0)) + def test_adaptive_log_softmax(self): # args validation with self.assertRaises(ValueError): @@ -6006,15 +6037,20 @@ def add(test_name, fn): add(test_name, lambda self, test=test: test(self)) cuda_test_name = test_name + '_cuda' # With dtype enable, it's good enough to test against three floating types + kwargs = {} + if 'extra_args' in get_function_arglist(test.test_cuda): + kwargs['extra_args'] = test.extra_args + if 'dtype' in get_function_arglist(test.test_cuda): add(cuda_test_name + '_float', lambda self, - test=test: test.test_cuda(self, dtype=torch.float)) + test=test, kwargs=kwargs: test.test_cuda(self, dtype=torch.float, **kwargs)) add(cuda_test_name + '_double', lambda self, - test=test: test.test_cuda(self, dtype=torch.double)) - add(cuda_test_name + '_half', lambda self, - test=test: test.test_cuda(self, dtype=torch.half)) + test=test, kwargs=kwargs: test.test_cuda(self, dtype=torch.double, **kwargs)) + if getattr(test, 'check_half', True): + add(cuda_test_name + '_half', lambda self, + test=test: test.test_cuda(self, dtype=torch.half, **kwargs)) else: - add(cuda_test_name, lambda self, test=test: test.test_cuda(self)) + add(cuda_test_name, lambda self, test=test, kwargs=kwargs: test.test_cuda(self, **kwargs)) def wrap_functional(fn, **kwargs): @@ -6174,6 +6210,45 @@ def forward(self, *args): check_sum_reduction=True, check_gradgrad=False, ), + dict( + module_name='CTCLoss', + constructor_args=(14,), # blank=14 + extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths + input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), + target_fn=lambda: torch.randint(0, 14, (3, 30), dtype=torch.long), + reference_fn=lambda i, t, il, tl, m: + ctcloss_reference(i, t, 
il, tl, blank=14, reduction=get_reduction(m)), + check_sum_reduction=True, + check_gradgrad=False, + check_half=False, + ), + dict( + module_name='CTCLoss', + desc='1d_target', + constructor_args=(14,), # blank=14 + extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths + input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), + target_fn=lambda: torch.randint(0, 14, (3, 30), dtype=torch.long), + reference_fn=lambda i, t, il, tl, m: + ctcloss_reference(i, t, il, tl, blank=14, reduction=get_reduction(m)), + check_sum_reduction=True, + check_gradgrad=False, + check_half=False, + ), + dict( + module_name='CTCLoss', + desc='2d_int_target', + constructor_args=(0,), # blank=0 + extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths + input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), + target_fn=lambda: torch.randint(1, 15, (3, 30), dtype=torch.int), + reference_fn=lambda i, t, il, tl, m: + ctcloss_reference(i, t, il, tl, blank=0, reduction=get_reduction(m)), + check_sum_reduction=True, + check_gradgrad=False, + check_half=False, + convert_target=False, + ), ] @@ -7766,6 +7841,21 @@ def multimarginloss_weights_no_reduce_test(): check_inplace=True, desc='scalar' ), + dict( + module_name='CELU', + input_size=(3, 2, 5), + constructor_args=(2.,), + check_inplace=True, + reference_fn=lambda x, _: torch.where(x >= 0, x, 2. * ((.5 * x).exp() - 1)) + ), + dict( + module_name='CELU', + input_size=(), + constructor_args=(2.,), + check_inplace=True, + reference_fn=lambda x, _: torch.where(x >= 0, x, 2. * ((.5 * x).exp() - 1)), + desc='scalar' + ), dict( module_name='GLU', input_size=(5, 6), diff --git a/test/test_optim.py b/test/test_optim.py index 41c3bfc1964f33..2d5b876dd3a8e1 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -31,7 +31,6 @@ def wrapper(closure, params, state): class TestOptim(TestCase): - def _test_rosenbrock(self, constructor, old_fn): params_t = torch.Tensor([1.5, 1.5]) state = {} @@ -505,6 +504,20 @@ def forward(self, x): return self.conv2(F.relu(self.conv1(x))) +class LambdaLRTestObject: + def __init__(self, value): + self.value = value + + def __call__(self, epoch): + return self.value * epoch + + def __eq__(self, other): + if isinstance(other, self.__class__): + return self.__dict__ == other.__dict__ + else: + return False + + class TestLRScheduler(TestCase): def setUp(self): self.net = SchedulerTestNet() @@ -672,6 +685,28 @@ def test_reduce_lr_on_plateau_state_dict(self): if key not in {'optimizer', 'is_better'}: self.assertEqual(scheduler.__dict__[key], scheduler_copy.__dict__[key], allow_inf=True) + def test_lambda_lr_state_dict_fn(self): + scheduler = LambdaLR(self.opt, lr_lambda=lambda x: x) + state = scheduler.state_dict() + self.assertIsNone(state['lr_lambdas'][0]) + + scheduler_copy = LambdaLR(self.opt, lr_lambda=lambda x: x) + scheduler_copy.load_state_dict(state) + for key in scheduler.__dict__.keys(): + if key not in {'optimizer', 'lr_lambdas'}: + self.assertEqual(scheduler.__dict__[key], scheduler_copy.__dict__[key], allow_inf=True) + + def test_lambda_lr_state_dict_obj(self): + scheduler = LambdaLR(self.opt, lr_lambda=LambdaLRTestObject(10)) + state = scheduler.state_dict() + self.assertIsNotNone(state['lr_lambdas'][0]) + + scheduler_copy = LambdaLR(self.opt, lr_lambda=LambdaLRTestObject(-1)) + scheduler_copy.load_state_dict(state) + for key in scheduler.__dict__.keys(): + if key not in {'optimizer'}: + self.assertEqual(scheduler.__dict__[key], scheduler_copy.__dict__[key], allow_inf=True) + def 
_check_scheduler_state_dict(self, constr, constr2, epochs=10): scheduler = constr() for _ in range(epochs): diff --git a/test/test_torch.py b/test/test_torch.py index 2a8c897713111f..edd69473f8505b 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -22,7 +22,7 @@ from torch import multiprocessing as mp from common import TestCase, iter_indices, TEST_NUMPY, TEST_SCIPY, TEST_MKL, \ TEST_LIBROSA, run_tests, download_file, skipIfNoLapack, suppress_warnings, \ - IS_WINDOWS, PY3, NO_MULTIPROCESSING_SPAWN, skipIfNoZeroSize, TEST_WITH_ROCM + IS_WINDOWS, PY3, NO_MULTIPROCESSING_SPAWN, TEST_WITH_ROCM from multiprocessing.reduction import ForkingPickler if TEST_NUMPY: @@ -866,7 +866,6 @@ def test_multidim(x, dim): def test_dim_reduction(self): self._test_dim_reduction(self, lambda t: t) - @skipIfNoZeroSize def test_reduction_empty(self): fns_to_test = [ # name, function, identity @@ -930,7 +929,6 @@ def test_reduction_empty(self): self.assertEqual(torch.ones((2, 1, 4), device=device), xb.all(1, keepdim=True)) self.assertEqual(torch.ones((), device=device), xb.all()) - @skipIfNoZeroSize def test_pairwise_distance_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -1690,6 +1688,7 @@ def test_einsum(self): ("...ii->...i", I), # batch diagonal # -- Other ("bn,anm,bm->ba", l, w, r), # as torch.bilinear + ("... ii->...i ", I), # batch diagonal with spaces ] for test in test_list: actual = torch.einsum(test[0], test[1:]) @@ -2240,7 +2239,6 @@ def test_tensor_factory_cuda_type(self): self.assertTrue(x.is_cuda) torch.set_default_tensor_type(saved_type) - @skipIfNoZeroSize def test_tensor_factories_empty(self): # ensure we can create empty tensors from each factory function shapes = [(5, 0, 1), (0,), (0, 0, 1, 0, 2, 0, 0)] @@ -2927,7 +2925,6 @@ def _test_in_place_broadcastable(t0, t1, t2=None): def test_broadcast(self): self._test_broadcast(self, lambda t: t) - @skipIfNoZeroSize def test_broadcast_empty(self): # empty + empty self.assertRaises(RuntimeError, lambda: torch.randn(5, 0) + torch.randn(0, 5)) @@ -2943,6 +2940,17 @@ def test_broadcast_empty(self): torch.randn(0, 7, 0, 6, 5, 0, 1) + torch.randn(1, 1, 5, 1, 7)) self.assertRaises(RuntimeError, lambda: torch.randn(7, 0) + torch.randn(2, 1)) + def test_broadcast_tensors(self): + x0 = torch.randn(2, 1, 3) + x1 = torch.randn(3) + x2 = torch.randn(3, 1) + expected_size = (2, 3, 3) + + y0, y1, y2 = torch.broadcast_tensors(x0, x1, x2) + self.assertTrue(y0.size() == expected_size) + self.assertTrue(y1.size() == expected_size) + self.assertTrue(y2.size() == expected_size) + @staticmethod def _test_contiguous(self, cast): x = cast(torch.randn(1, 16, 5, 5)) @@ -2957,9 +2965,7 @@ def test_contiguous(self): return self._test_contiguous(self, lambda t: t) def test_empty_tensor_props(self): - sizes = [(0,)] - if torch._C._use_zero_size_dim(): - sizes += [(0, 3), (5, 0), (5, 0, 3, 0, 2), (0, 3, 0, 2), (0, 5, 0, 2, 0)] + sizes = [(0,), (0, 3), (5, 0), (5, 0, 3, 0, 2), (0, 3, 0, 2), (0, 5, 0, 2, 0)] devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for size in sizes: for device in devices: @@ -3476,9 +3482,6 @@ def test_cat_empty_legacy(self): @staticmethod def _test_cat_empty(self, use_cuda=False): - if not torch._C._use_zero_size_dim(): - return - dtype = torch.float32 device = 'cuda' if use_cuda else 'cpu' @@ -3524,9 +3527,6 @@ def test_narrow(self): self.assertEqual(x.narrow(-2, -1, 1), torch.Tensor([[6, 7, 8]])) def test_narrow_empty(self): - if not 
torch._C._use_zero_size_dim(): - return - devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: x = torch.randn(2, 3, 4, device=device) @@ -3658,7 +3658,7 @@ def test_randn(self): self.assertEqual(res1, res2) def test_slice(self): - empty = torch.empty(0, 4) if torch._C._use_zero_size_dim() else torch.Tensor() + empty = torch.empty(0, 4) x = torch.arange(0., 16).view(4, 4) self.assertEqual(x[:], x) self.assertEqual(x[:4], x) @@ -4951,10 +4951,7 @@ def consec(size, start=1): reference = conv_fn(consec((3, 3, 3))) # empty tensor indexing - if torch._C._use_zero_size_dim(): - self.assertEqual(reference[conv_fn(torch.LongTensor())], reference.new(0, 3, 3)) - else: - self.assertEqual(reference[conv_fn(torch.LongTensor())], reference.new()) + self.assertEqual(reference[conv_fn(torch.LongTensor())], reference.new(0, 3, 3)) self.assertEqual(reference[0], consec((3, 3)), 0) self.assertEqual(reference[1], consec((3, 3), 10), 0) @@ -5000,14 +4997,9 @@ def consec(size, start=1): self.assertEqual(reference[None, 2:5, None, None], reference.unsqueeze(0)[:, 2:5].unsqueeze(2).unsqueeze(2)) # indexing 0-length slice - if torch._C._use_zero_size_dim(): - self.assertEqual(torch.empty(0, 5, 5), reference[slice(0)]) - self.assertEqual(torch.empty(0, 5), reference[slice(0), 2]) - self.assertEqual(torch.empty(0, 5), reference[2, slice(0)]) - else: - self.assertEqual(torch.tensor([]), reference[slice(0)]) - self.assertEqual(torch.tensor([]), reference[slice(0), 2]) - self.assertEqual(torch.tensor([]), reference[2, slice(0)]) + self.assertEqual(torch.empty(0, 5, 5), reference[slice(0)]) + self.assertEqual(torch.empty(0, 5), reference[slice(0), 2]) + self.assertEqual(torch.empty(0, 5), reference[2, slice(0)]) self.assertEqual(torch.tensor([]), reference[2, 1:1, 2]) # indexing with step @@ -5717,7 +5709,6 @@ def check(src, idx): check(src, idx) check(src.transpose(1, 2), idx) - @skipIfNoZeroSize def test_take_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -5748,7 +5739,6 @@ def test_put_accumulate(self): dst.put_(idx, src, accumulate=True) self.assertEqual(dst.tolist(), [[5, 7], [1, 1]]) - @skipIfNoZeroSize def test_put_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6070,7 +6060,6 @@ def _test_view(self, cast): def test_view(self): TestTorch._test_view(self, lambda x: x) - @skipIfNoZeroSize def test_view_empty(self): x = torch.randn(0, 6) self.assertEqual((1, 0, 6, 1, 1), x.view(1, 0, 6, 1, 1).shape) @@ -6096,12 +6085,8 @@ def test_reshape(self): self.assertEqual(empty, empty.reshape(-1)) self.assertEqual(empty, empty.reshape([0])) # TODO: fix these once we have multi-dimensional empty tensors - if torch._C._use_zero_size_dim(): - self.assertEqual(empty.reshape([0, 1]).shape, (0, 1)) - self.assertEqual(empty.reshape([1, -1]).shape, (1, 0)) - else: - self.assertEqual(empty.reshape([0, 1]).shape, (0,)) - self.assertEqual(empty.reshape([1, -1]).shape, (0,)) + self.assertEqual(empty.reshape([0, 1]).shape, (0, 1)) + self.assertEqual(empty.reshape([1, -1]).shape, (1, 0)) self.assertRaises(RuntimeError, lambda: empty.reshape(1)) x = torch.randn(3, 3) @@ -6109,7 +6094,6 @@ def test_reshape(self): self.assertEqual(x.data_ptr(), x.reshape_as(torch.rand(1, 9, 1)).data_ptr()) self.assertRaises(RuntimeError, lambda: x.reshape_as(torch.rand(10))) - @skipIfNoZeroSize def test_empty_reshape(self): x = torch.randn(0, 6) self.assertEqual((1, 0, 6, 1, 1), x.reshape(1, 
0, 6, 1, 1).shape) @@ -6119,7 +6103,6 @@ def test_empty_reshape(self): # match NumPy semantics -- don't infer the size of dimension with a degree of freedom self.assertRaises(RuntimeError, lambda: x.reshape(0, -1)) - @skipIfNoZeroSize def test_tensor_shape_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6185,7 +6168,6 @@ def test_tensor_shape_empty(self): self.assertEqual([(0, 1, 3, 0)], [z.shape for z in torch.split(x, 0, dim=0)]) # functions that operate over a dimension but don't reduce. - @skipIfNoZeroSize def test_dim_function_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6309,7 +6291,6 @@ def test_dim_function_empty(self): c = torch.randn((0, 1, 2), device=device) self.assertEqual(c, c.index_select(0, ind_empty)) - @skipIfNoZeroSize def test_blas_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6379,7 +6360,6 @@ def fn(torchfn, *args): A_LU, pivots = fn(torch.btrifact, (2, 0, 0)) self.assertEqual([(2, 0, 0), (2, 0)], [A_LU.shape, pivots.shape]) - @skipIfNoZeroSize def test_blas_alpha_beta_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6405,7 +6385,6 @@ def test_blas_alpha_beta_empty(self): self.assertEqual(torch.full((2, 3), beta * value, device=device), torch.addmm(input=input, mat1=mat, mat2=mat2, alpha=alpha, beta=beta, out=out)) - @skipIfNoZeroSize @skipIfNoLapack def test_lapack_empty(self): # FIXME: these are just a selection of LAPACK functions -- we need a general strategy here. @@ -6896,9 +6875,6 @@ def test_nonzero(self): self.assertNotEqual(tensor[dst1[i, 0], dst1[i, 1], dst1[i, 2]].item(), 0) def test_nonzero_empty(self): - if not torch._C._use_zero_size_dim(): - return - devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: x = torch.randn(0, 2, 0, 5, 0, device=device) @@ -7523,15 +7499,11 @@ def test_load_error_msg(self): expected_err_msg = (".*You can only torch.load from a file that is seekable. 
" + "Please pre-load the data into a buffer like io.BytesIO and " + "try to load from it instead.") - if PY3: - import urllib.request - import io - resource = urllib.request.urlopen('https://download.pytorch.org/test_data/linear.pt') - self.assertRaisesRegex(io.UnsupportedOperation, expected_err_msg, lambda: torch.load(resource)) - else: - import urllib - resource = urllib.urlopen('https://download.pytorch.org/test_data/linear.pt') - self.assertRaisesRegex(AttributeError, expected_err_msg, lambda: torch.load(resource)) + + resource = FilelikeMock(data=b"data") + delattr(resource, "tell") + delattr(resource, "seek") + self.assertRaisesRegex(AttributeError, expected_err_msg, lambda: torch.load(resource)) def test_from_buffer(self): a = bytearray([1, 2, 3, 4]) @@ -7894,10 +7866,7 @@ def test_from_numpy(self): # check zero dimensional x = np.zeros((0, 2)) - if torch._C._use_zero_size_dim(): - self.assertEqual(torch.from_numpy(x).shape, (0, 2)) - else: - self.assertEqual(torch.from_numpy(x).shape, (0,)) + self.assertEqual(torch.from_numpy(x).shape, (0, 2)) # check ill-sized strides raise exception x = np.array([3., 5., 8.]) @@ -7947,6 +7916,20 @@ def test_ctor_with_numpy_array(self): for i in range(len(array)): self.assertEqual(tensor[i], array[i]) + @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + def test_ctor_with_numpy_scalar_ctor(self): + dtypes = [ + np.double, + np.float, + np.float16, + np.int64, + np.int32, + np.int16, + np.uint8 + ] + for dtype in dtypes: + self.assertEqual(dtype(42), torch.tensor(dtype(42)).item()) + @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_numpy_index(self): i = np.int32([0, 1, 2]) @@ -8034,6 +8017,17 @@ def test_numpy_array_interface(self): for i in range(len(x)): self.assertEqual(geq2_x[i], geq2_array[i]) + @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + def test_multiplication_numpy_scalar(self): + np_sc = np.float64(2.0) + t = torch.ones(2, requires_grad=True) + r1 = np_sc * t + self.assertIsInstance(r1, torch.Tensor) + self.assertTrue(r1.requires_grad) + r2 = t * np_sc + self.assertIsInstance(r2, torch.Tensor) + self.assertTrue(r2.requires_grad) + def test_error_msg_type_translation(self): with self.assertRaisesRegex( RuntimeError, diff --git a/third_party/eigen b/third_party/eigen index e9e95489a0b241..cafae68f33f7f4 160000 --- a/third_party/eigen +++ b/third_party/eigen @@ -1 +1 @@ -Subproject commit e9e95489a0b241412e31f0525e85b2fab386c786 +Subproject commit cafae68f33f7f41270b2e8c2dd181f510aa4d918 diff --git a/third_party/onnx b/third_party/onnx index c761845c7f6880..32ac71b1b9c1bd 160000 --- a/third_party/onnx +++ b/third_party/onnx @@ -1 +1 @@ -Subproject commit c761845c7f6880ab7eb7e2866d673834c7149e89 +Subproject commit 32ac71b1b9c1bd7f196eed3b311734ec6ab3c367 diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 14fd6d7cf5e09c..a66cb77f8ce9dd 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -201,6 +201,9 @@ - name: conv_tbc(Tensor self, Tensor weight, Tensor bias, int64_t pad) self, weight, bias: conv_tbc_backward(grad, self, weight, bias, pad) +- name: _ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank) + log_probs: _ctc_loss_backward(grad, log_probs, targets, input_lengths, target_lengths, result0, result1, blank) + - name: det(Tensor self) self: det_backward(grad, self, result) @@ -308,6 +311,12 @@ self: gesv_backward_self(grad, self, A) A: gesv_backward_A(grad, self, A, result0) +- name: 
grid_sampler_2d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) + input, grid: grid_sampler_2d_backward(grad, input, grid, interpolation_mode, padding_mode) + +- name: grid_sampler_3d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) + input, grid: grid_sampler_3d_backward(grad, input, grid, interpolation_mode, padding_mode) + - name: gt_(Tensor self, Scalar other) self: zeros_like(self) @@ -802,8 +811,8 @@ - name: relu(Tensor self) self: threshold_backward(grad, self, 0, 0) -- name: elu_forward(Tensor self, Scalar alpha, Scalar scale) - self: elu_backward(grad, alpha, scale, output) +- name: elu_forward(Tensor self, Scalar alpha, Scalar scale, Scalar input_scale) + self: elu_backward(grad, alpha, scale, input_scale, output) - name: glu_forward(Tensor self, int64_t dim) self: glu_backward(grad, self, dim) @@ -974,12 +983,6 @@ - name: thnn_conv_dilated3d_backward(Tensor grad_output, Tensor self, Tensor weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, Tensor columns, Tensor ones, std::array output_mask) grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, dilation, false, {{0, 0, 0}}, 1, false, false, false, grad_input_mask) -- name: thnn_grid_sampler_bilinear2d_forward(Tensor self, Tensor grid, int64_t padding_mode) - self, grid: thnn_grid_sampler_bilinear2d_backward(grad, self, grid, padding_mode) - -- name: thnn_grid_sampler_bilinear3d_forward(Tensor self, Tensor grid, int64_t padding_mode) - self, grid: thnn_grid_sampler_bilinear3d_backward(grad, self, grid, padding_mode) - # NN double backwards support - name: adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) @@ -1006,9 +1009,9 @@ grad_output: avg_pool3d(grad, kernel_size, stride, padding, ceil_mode, count_include_pad) self: zeros_like(self) -- name: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Tensor output) - grad_output: elu_backward(grad, alpha, scale, output) - output: grad * grad_output * (output < 0).toType(grad.type()) +- name: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output) + grad_output: elu_backward(grad, alpha, scale, input_scale, output) + output: grad * grad_output * input_scale * (output < 0).toType(grad.type()) - name: fractional_max_pool2d_backward(Tensor grad_output, Tensor self, IntList kernel_size, IntList output_size, Tensor indices) grad_output: max_pool_double_backward(grad, indices, 2) @@ -1145,6 +1148,8 @@ output: -2 * output * grad * grad_output # cudnn +- name: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank, bool deterministic) + log_probs: result1 - name: cudnn_convolution_transpose(Tensor self, Tensor weight, Tensor bias, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) self, weight, bias: cudnn_convolution_transpose_backward(self, grad, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic, grad_input_mask) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 45af42655f96cc..2bee61b024317e 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -340,6 +340,8 @@ def save_variables(saved_variables, is_output): elif arg['type'] == 'TensorList': name += '_' expr = 'make_saved_variable_list({})'.format(arg['name']) + elif 
arg['type'] == 'IntList': + expr = expr + ".vec()" stmts.append('grad_fn->{} = {};'.format(name, expr)) return stmts diff --git a/tools/autograd/templates/Functions.cpp b/tools/autograd/templates/Functions.cpp index f859f814b4f8bc..0622fae5f2e8e8 100644 --- a/tools/autograd/templates/Functions.cpp +++ b/tools/autograd/templates/Functions.cpp @@ -175,7 +175,7 @@ Tensor prod_safe_zeros_backward(const Tensor &grad, const Tensor& inp, int64_t d return grad; } - std::vector ones_size(inp.sizes()); + auto ones_size = inp.sizes().vec(); ones_size[dim] = 1; Tensor ones = at::ones(ones_size, grad.type()); Tensor exclusive_normal_nocp = at::cat({ones, inp.narrow(dim, 0, inp.size(dim) - 1)}, dim); @@ -328,7 +328,7 @@ Tensor cumprod_backward(const Tensor &grad, const Tensor &input, int64_t dim) { return sum_scan_exclusive(result * grad, dim) / input; } - std::vector ones_size(input.sizes()); + auto ones_size = input.sizes().vec(); ones_size[dim] = 1; Tensor ones = at::ones({1}, grad.type()).expand(ones_size); Tensor grad_input = at::zeros(input.sizes(), grad.type()); @@ -461,7 +461,7 @@ Tensor mm_mat2_backward(const Tensor & grad, const Tensor & mat1, IntList sizes, } Tensor renorm_backward(const Tensor & grad, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { - auto transposed_sizes = std::vector(self.transpose(dim, 0).sizes()); + auto transposed_sizes = self.transpose(dim, 0).sizes().vec(); auto flatten = [&](const Tensor & t) { return t.transpose(dim, 0).contiguous().view({t.size(dim), -1}); }; @@ -637,7 +637,7 @@ Tensor split_with_sizes_backward(const std::vector &g grads_all_defined[j] = grads[j]; } else { auto length = split_sizes[j]; - std::vector grad_size(sizes); + auto grad_size = sizes.vec(); grad_size[dim] = length; grads_all_defined[j] = at::zeros(grad_size, type); } @@ -659,7 +659,7 @@ Tensor split_backward(const std::vector &grads, Tensor max_pool_double_backward(const Tensor & grad, const Tensor & indices, int dim) { AT_ASSERT(indices.dim() >= dim); - auto size = std::vector(indices.sizes().slice(0, indices.dim() - dim)); + auto size = indices.sizes().slice(0, indices.dim() - dim).vec(); size.push_back(-1); auto indices_view = indices.view(size); return grad.contiguous().view(size).gather(-1, indices_view).view(indices.sizes()); @@ -686,7 +686,7 @@ Tensor glu_double_backward(const Tensor & grad, const Tensor & grad_output, cons Tensor glu_double_backward_grad_output(const Tensor & grad, const Tensor & input, int64_t dim) { if (dim < 0) dim += input.dim(); - std::vector sizes = input.sizes(); + auto sizes = input.sizes().vec(); sizes[dim] /= 2; auto tmp = grad * glu_backward(at::ones(sizes, input.type()), input, dim); return tmp.narrow(dim, 0, sizes[dim]) + tmp.narrow(dim, sizes[dim], sizes[dim]); @@ -1545,27 +1545,27 @@ Tensor symeig_backward(const std::vector &grads, cons bool eigenvectors, bool upper, const Tensor& lambda, const Tensor& v) { auto glambda = grads[0]; auto gv = grads[1]; - + auto vt = v.t(); - + if (!eigenvectors) { throw std::runtime_error(std::string("cannot compute backward without " "computing eigenvectors in forward pass")); } - + Tensor result; if (gv.defined()) { Tensor F = lambda.unsqueeze(0).expand_as(self).clone(); F.sub_(at::unsqueeze(lambda, 1)); F.diagonal().fill_(INFINITY); F.pow_(-1); - + F.mul_(vt.mm(gv)); result = v.mm(F.mm(vt)); } else { result = at::zeros_like(self); } - + if (glambda.defined()) { result.add_((v * glambda).mm(vt)); } diff --git a/tools/autograd/templates/Functions.h b/tools/autograd/templates/Functions.h index 
ae95bf7197770e..00d927f1fdf7f8 100644 --- a/tools/autograd/templates/Functions.h +++ b/tools/autograd/templates/Functions.h @@ -29,7 +29,7 @@ struct TypeAndSize { TypeAndSize() : type(nullptr) {} /* implicit */ TypeAndSize(const Tensor & t) - : sizes(t.sizes()) + : sizes(t.sizes().vec()) , type(&t.type()) {} Tensor zeros() { return at::zeros(sizes, *type); } diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index 2f1adf0ab59f4b..bd4c59cfe9d380 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -398,7 +398,7 @@ Tensor VariableType::contiguous(const Tensor & self) const { static std::vector> to_args_sizes(TensorList tensors) { std::vector> args_sizes(tensors.size()); for (size_t i = 0; i < tensors.size(); ++i) { - args_sizes[i] = tensors[i].sizes(); + args_sizes[i] = tensors[i].sizes().vec(); } return args_sizes; } diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 4a0dbd04c905f1..8f79c2830e96c0 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -97,7 +97,11 @@ if [[ $(uname) == 'Darwin' ]]; then LDFLAGS="$LDFLAGS -Wl,-rpath,@loader_path" LD_POSTFIX=".dylib" else - LDFLAGS="$LDFLAGS -Wl,-rpath,\$ORIGIN" + if [[ $USE_ROCM -eq 1 ]]; then + LDFLAGS="$LDFLAGS -Wl,-rpath,\\\\\\\$ORIGIN" + else + LDFLAGS="$LDFLAGS -Wl,-rpath,\$ORIGIN" + fi fi CPP_FLAGS=" -std=c++11 " GLOO_FLAGS="" diff --git a/tools/clang_tidy.py b/tools/clang_tidy.py index abbadc70691b46..77b101dedf0f3e 100644 --- a/tools/clang_tidy.py +++ b/tools/clang_tidy.py @@ -7,6 +7,7 @@ import subprocess import sys + DEFAULT_FILE_PATTERN = r".*\.[ch](pp)?" # @@ -start,count +start,count @@ @@ -26,6 +27,11 @@ def run_shell_command(arguments, process_name=None): return output.decode() +def normalize_directory_path(path): + """Normalizes a directory path.""" + return path.rstrip('/') + + def transform_globs_into_regexes(globs): """Turns glob patterns into regular expressions.""" return [glob.replace("*", ".*").replace("?", ".") for glob in globs] @@ -49,16 +55,37 @@ def git_diff(args, verbose): return run_shell_command(command, process_name="git diff") -def filter_files(files, file_patterns): +def filter_files(files, file_patterns, verbose): """Returns all files that match any of the patterns.""" filtered = [] for file in files: + has_match = False for pattern in file_patterns: - if pattern.match(file): + if pattern.search(file): filtered.append(file) + has_match = True + if not has_match and verbose: + message = "{} does not match any ".format(file) + message += "file pattern in {{{}}}".format(', '.join(map(str, file_patterns))) + print(message) return filtered +def remove_recursive_files(files, paths, verbose): + """ + Removes all files that are not immediately under one of the given paths. + """ + for file in files: + if os.path.dirname(file) in paths: + yield file + else: + if verbose: + + message = "{} ({}) does not match any ".format(file, os.path.dirname(file)) + message += "non-recursive path in {{{}}}".format(", ".join(paths)) + print(message) + + def get_changed_files(revision, paths, verbose): """Runs git diff to get the paths of all changed files.""" # --diff-filter AMU gets us files that are (A)dded, (M)odified or (U)nmerged (in the working copy). 
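As a quick illustration of the new path handling in tools/clang_tidy.py above: a file now survives filtering only if a file pattern matches anywhere in its path (search rather than match) and, with -n/--no-recursive, only if its immediate parent directory is one of the -p/--paths entries. A minimal sketch of that logic (simplified, not the script's exact code):

    import os
    import re

    def keep_non_recursive(files, paths):
        # Keep only files whose immediate parent directory is one of the given paths.
        return [f for f in files if os.path.dirname(f) in paths]

    pattern = re.compile(r".*\.[ch](pp)?")
    files = ["torch/csrc/jit/ir.cpp", "torch/csrc/Module.cpp", "README.md"]
    files = [f for f in files if pattern.search(f)]      # drops README.md
    print(keep_non_recursive(files, ["torch/csrc"]))     # ['torch/csrc/Module.cpp']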
@@ -152,7 +179,17 @@ def parse_options(): ) parser.add_argument("-r", "--revision", help="Git revision to get changes from") parser.add_argument( - "-p", "--paths", nargs="+", default=["."], help="Lint only the given paths" + "-p", + "--paths", + nargs="+", + default=["."], + help="Lint only the given paths (recursively)", + ) + parser.add_argument( + "-n", + "--no-recursive", + action="store_true", + help="If paths are supplied with -p/--paths, do not recurse into paths", ) parser.add_argument( "-s", @@ -173,12 +210,15 @@ def parse_options(): def main(): options = parse_options() + paths = map(normalize_directory_path, options.paths) if options.revision: - files = get_changed_files(options.revision, options.paths, options.verbose) + files = get_changed_files(options.revision, paths, options.verbose) else: - files = get_all_files(options.paths) + files = get_all_files(paths) + if options.no_recursive: + files = remove_recursive_files(files, paths, options.verbose) file_patterns = get_file_patterns(options.glob, options.regex) - files = filter_files(files, file_patterns) + files = filter_files(files, file_patterns, options.verbose) # clang-tidy error's when it does not get input files. if not files: diff --git a/tools/cpp_build/build_caffe2.sh b/tools/cpp_build/build_caffe2.sh index b35435acb388c6..6a50c14e05523e 100755 --- a/tools/cpp_build/build_caffe2.sh +++ b/tools/cpp_build/build_caffe2.sh @@ -24,6 +24,7 @@ cmake -DUSE_CUDA:BOOL=$USE_CUDA \ -DCMAKE_BUILD_TYPE:STRING=$BUILD_TYPE \ -DCMAKE_INSTALL_PREFIX:STRING=$INSTALL_PREFIX \ -DCMAKE_INSTALL_MESSAGE=NEVER \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=ON \ -G "$GENERATE" \ $PYTORCHPATH/ $MAKE -j "$JOBS" install diff --git a/tools/cpp_build/build_libtorch.sh b/tools/cpp_build/build_libtorch.sh index 92a9b9981ed697..6dd9a589cf1074 100755 --- a/tools/cpp_build/build_libtorch.sh +++ b/tools/cpp_build/build_libtorch.sh @@ -24,6 +24,7 @@ cmake -DUSE_CUDA:BOOL=$USE_CUDA \ -DCMAKE_INSTALL_MESSAGE=NEVER \ -Dnanopb_BUILD_GENERATOR:BOOL=OFF \ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=ON \ -DVERBOSE:BOOL=${VERBOSE:-0} \ -G "$GENERATE" \ $PYTORCHPATH/torch diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index ad9ad2e05c4f4c..5a76d447ad2498 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -52,28 +52,6 @@ def jit_type_of(arg): typ = '{}?'.format(typ) return typ -# map from aten 'simple_type' to the function that will cast a attribute value -# to that type -FROM_ATTRIBUTE = { - 'Device': 'as_device(node->is(attr::{}))', - 'IntList': 'std::vector(node->is(attr::{}))', - 'Layout': 'static_cast(node->i(attr::{}))', - 'Scalar': 'Scalar(node->t(attr::{}))', - 'ScalarType': 'static_cast(node->i(attr::{}))', - 'Tensor': 'node->t(attr::{})', - 'bool': 'bool(node->i(attr::{}))', - 'double': 'node->f(attr::{})', - 'int64_t': 'node->i(attr::{})', - 'std::array': 'as_bool_array<2>(node->is(attr::{}))', - 'std::array': 'as_bool_array<3>(node->is(attr::{}))', - 'std::array': 'as_bool_array<4>(node->is(attr::{}))', -} - - -def from_attribute(arg): - simple_type = arg['simple_type'] - return FROM_ATTRIBUTE[simple_type].format(arg['name']) - # map from aten 'simple_type' to the function that will turn a tensor into # that type @@ -84,6 +62,7 @@ def from_attribute(arg): 'Scalar': '{}.toScalar()', 'ScalarType': 'static_cast({}.toInt())', 'Tensor': '{}.toTensor()', + 'TensorList': '{}.toTensorList()->elements()', 'bool': 'bool({}.toInt())', 'double': '{}.toDouble()', 
'int64_t': '{}.toInt()', @@ -98,15 +77,13 @@ def from_ivalue(arg, value): return FROM_IVALUE[simple_type].format(value) -KW_ACCESS = CodeTemplate("""(node->${method}(Symbol::attr("${name}")))""") - CALL_NAMESPACE = CodeTemplate("""\ auto result = at::${name}( ${args} ); """) CALL_METHOD = CodeTemplate("""\ -DeviceGuard device_guard(deviceForInputs(stack, ${num_dynamic_inputs})); +DeviceGuard device_guard(deviceForInputs(stack, ${num_inputs})); auto result = (${first}).${name}( ${args} ); @@ -122,24 +99,20 @@ def from_ivalue(arg, value): ); """) -# TODO (apaszke): remove the attributed codepath once we remove them CONSTRUCTOR = CodeTemplate("""\ -[](Node *node) { - ${kw_assignments} - return Operation([=](Stack & stack) { +[](Stack & stack) { autograd::profiler::RecordFunction record("${name}"); ${call} - drop(stack, ${num_dynamic_inputs}); + drop(stack, ${num_inputs}); pack(stack, std::move(result)); return 0; - }); } """) OPERATOR = CodeTemplate("""\ Operator( "${signature}", - ${ops} + ${op} ), """) @@ -171,9 +144,6 @@ def is_jit_op(decl): # we currently only support vararg tensor lists when they are the _first_ argument # and the only tensor argument arguments = decl['arguments'] - # Only support a single TensorList arg - if sum(arg['simple_type'] == 'TensorList' for arg in arguments) > 1: - return False return ((not decl['api_name'].endswith('_') or is_magic_method(decl['api_name'])) and not decl['name'].endswith('_out') and @@ -197,7 +167,7 @@ def gen_jit_dispatch(declarations, out, template_path): ops = [] - def get_invocation(decl, args, num_dynamic_inputs): + def get_invocation(decl, args, num_inputs): # because the arg list can get lengthy we put them on a separate line def pack_arguments(args): @@ -211,109 +181,36 @@ def pack_arguments(args): elif 'namespace' in decl['method_of']: return CALL_NAMESPACE.substitute(name=decl['name'], args=pack_arguments(args), - num_dynamic_inputs=num_dynamic_inputs) + num_inputs=num_inputs) else: return CALL_METHOD.substitute( name=decl['name'], first=args[0], args=pack_arguments(args[1:]), - num_dynamic_inputs=num_dynamic_inputs) + num_inputs=num_inputs) - def emit_decl_variant(decl, is_positional_arg, has_tensorlist): - # is_positional_arg is a boolean list the same length as decl['arguments'] - # that indicates if the argument should come from the postional list - # of inputs. If false, the argument comes from the constant attributes + def emit_decl_variant(decl): kw_assignments = [] arguments = [] - - if has_tensorlist: - kw_assignments.append('size_t varargs_length = node->inputs().size();') - # arguments look like: [tensor list], arg1, arg2, arg3 - # we use peek(, static_inputs) to read the non-vararg inputs - # from the end of the stack - static_inputs = sum(is_positional_arg) - 1 - num_dynamic_inputs = 'varargs_length' - tensorlist_idx = [i for i, arg in enumerate(decl['arguments']) if arg['simple_type'] == 'TensorList'][0] - else: - static_inputs = sum(is_positional_arg) - num_dynamic_inputs = static_inputs + num_inputs = len(decl['arguments']) real_inputs = 0 - for i, arg in enumerate(decl['arguments']): - # This conditional allows us to process argument lists with a flattened argument list - # with a single TensorList. Given the sequence of arguments: - # a b c [d e f g] h i # [] is the list - # - # 1. For the section where we are processing positional inputs before the - # TensorList: - # a b c [d e f g] h i # [] is the list - # ~~~~~~~~~~~~ <- N - # we set this view_length to the total number of varargs inputs (i.e. 
the length) - # of the whole argument list. This means that indexing into the list using peek() - # we will retrieve arguments ar their true indices (i.e. peek at 0 points to a, - # 1 points to b, etc...). Similarly, we can use peekSlice() to index into the - # list itself this way. - # 2. After the list: - # a b c [d e f g] h i # [] is the list - # ~~~~~~ <- N - # Here we set the view length to static_inputs. In our example, - # we effectively ignore the fact that we have a list here. What is - # significant is that our index i is equivalent when the view length - # is right-justified, whether we have the list or not. Concretely, - # indexing h or i from `a b c [d e f g] h i` is equvalent to indexing - # h or i from `a b c h i`. - view_length = 'varargs_length' if has_tensorlist and i < tensorlist_idx else static_inputs - - if arg['simple_type'] == 'TensorList': - # NOTE: don't advance real_inputs here. After this we are going - # to switch over to indexing from the end as if we only had - # the static arguments. - arguments.append('toTensors(peekSlice(stack, {}, varargs_length - {}, varargs_length))' - .format(real_inputs, static_inputs)) - elif arg['simple_type'] in default_only_types: + for arg in decl['arguments']: + if arg['simple_type'] in default_only_types: arguments.append(arg['default']) - elif is_tensor_arg(arg) or is_positional_arg[i]: - value = '(std::move(peek(stack, {}, {})))'.format(real_inputs, view_length) + else: + value = '(std::move(peek(stack, {}, {})))'.format(real_inputs, num_inputs) arguments.append(from_ivalue(arg, value)) real_inputs += 1 - else: - assign = "auto {} = {};".format(arg['name'], from_attribute(arg)) - kw_assignments.append(assign) - arguments.append(arg['name']) - call = get_invocation(decl, arguments, num_dynamic_inputs) + call = get_invocation(decl, arguments, num_inputs) returns = decl['returns'] - all_scalars = all(r['dynamic_type'] != 'TensorList' for r in returns) constructor = CONSTRUCTOR.substitute(name=decl['name'], call=call, kw_assignments=kw_assignments, - num_dynamic_inputs=num_dynamic_inputs) + num_inputs=num_inputs) return constructor - def emit_decl(decl): - arguments = decl['arguments'] - has_tensorlist = any(arg['simple_type'] == 'TensorList' for arg in arguments) - num_tensor_args = sum(map(is_tensor_arg, arguments)) - - # Right now, we generate dispatch methods that either take all non-tensor arguments - # as attributes, or don't use any attributes at all. In the future we might want to - # have something in the middle too (might be useful for e.g. constant propagation - # into attributes, as that would allow us to avoid reparsing tensors into scalar - # args at every invocation). - - all_real_arguments_are_inputs = tuple(arg['simple_type'] not in default_only_types for arg in arguments) - only_tensors_are_inputs = tuple(is_tensor_arg(arg) for arg in arguments) - - variants = [emit_decl_variant(decl, all_real_arguments_are_inputs, has_tensorlist)] - # in some cases there are no inputs that are possibly attributes, so the - # variants are actually the same. If so avoid generating both to save compilation - # time. - if all_real_arguments_are_inputs != only_tensors_are_inputs: - variants += [',', emit_decl_variant(decl, only_tensors_are_inputs, has_tensorlist)] - - ops.append(OPERATOR.substitute(signature=signature(decl), - ops=variants)) - # This function declares an order on declarations. 
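The "right-justified" indexing that the removed comment describes is easier to see with a toy Python model of peek (hypothetical, not the actual C++ helper): reading argument i out of the last N stack entries yields the same element whether or not a variable-length list was pushed earlier.

    def peek(stack, i, n):
        # Element i of the last n values on the stack.
        return stack[len(stack) - n + i]

    stack = list("abcdefghi")     # here d, e, f, g stand in for a flattened TensorList
    print(peek(stack, 7, 9))      # 'h', indexing against the full varargs length
    print(peek(stack, 3, 5))      # 'h' again, indexing against only the 5 static args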
This is necessary because # there is some ambiguity in the choice of overload: if an argument is overloaded # to accept both Scalar and Tensor, the schema with the Tensor should come first @@ -376,7 +273,8 @@ def declkey(decl): jit_decls = sort_decls(jit_decls) for decl in jit_decls: - emit_decl(decl) + ops.append(OPERATOR.substitute(signature=signature(decl), + op=emit_decl_variant(decl))) # Sort the generated snippets to ensure that the generation is deterministic env = { diff --git a/tools/jit/templates/register_aten_ops.cpp b/tools/jit/templates/register_aten_ops.cpp index 06ad9c2840b1cc..3dc973463d6e90 100644 --- a/tools/jit/templates/register_aten_ops.cpp +++ b/tools/jit/templates/register_aten_ops.cpp @@ -29,7 +29,6 @@ using autograd::Variable; using autograd::variable_list; using at::Scalar; using at::Tensor; -using at::TensorList; using at::TensorOptions; using at::DeviceGuard; @@ -42,26 +41,20 @@ int deviceForInputs(Stack & stack, size_t N) { return t.type().is_cuda() ? (int) t.get_device() : -1; } -std::vector toTensors(at::ArrayRef ivalues) { - return fmap(ivalues, [](const IValue& v) { - return v.toTensor(); - }); -} - template -std::array as_bool_array(const std::vector& vec) { +std::array as_bool_array(at::ArrayRef vec) { std::array res; JIT_ASSERT(vec.size() == N); std::copy(vec.begin(), vec.end(), res.begin()); return res; } -at::Device as_device(const std::vector& elements) { +at::Device as_device(ArrayRef elements) { return at::Device(static_cast(elements[0]), elements[1]); } RegisterOperators reg({ -${constructors} + ${constructors} }); } // anon namespace diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 88546fda7ed604..057bf6efeac3dd 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -102,6 +102,7 @@ add_custom_command( "${TOOLS_PATH}/autograd/gen_autograd.py" "${TOOLS_PATH}/autograd/gen_autograd_functions.py" "${TOOLS_PATH}/autograd/gen_variable_type.py" + "${TOOLS_PATH}/jit/gen_jit_dispatch.py" "${TOOLS_PATH}/jit/templates/register_aten_ops.cpp" "${TOOLS_PATH}/jit/templates/aten_interned_strings.h" WORKING_DIRECTORY "${TORCH_SRC_DIR}/..") @@ -138,6 +139,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/operator.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/constant_propagation.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/common_subexpression_elimination.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/create_autodiff_subgraphs.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/dead_code_elimination.cpp @@ -161,8 +163,6 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/test_jit.cpp ${TORCH_SRC_DIR}/csrc/jit/tracer.cpp ${TORCH_SRC_DIR}/csrc/jit/type.cpp - ${TORCH_SRC_DIR}/csrc/onnx/onnx.cpp - ${TORCH_SRC_DIR}/csrc/onnx/onnx.npb.cpp ${TORCH_SRC_DIR}/csrc/torch.cpp ${TORCH_SRC_DIR}/csrc/utils/tensor_flatten.cpp ${TORCH_SRC_DIR}/csrc/utils/variadic.cpp @@ -267,6 +267,12 @@ if(OPENMP_FOUND) target_link_libraries(torch -fopenmp) endif() +if (NOT NO_API AND NOT USE_ROCM) + target_include_directories(torch PUBLIC + ${TORCH_SRC_DIR}/csrc/api + ${TORCH_SRC_DIR}/csrc/api/include) +endif() + if(USE_CUDA) if(MSVC) set(TORCH_CUDA_LIBRARIES @@ -365,7 +371,7 @@ install(TARGETS torch ARCHIVE DESTINATION "${TORCH_INSTALL_LIB_DIR}") # JIT Tests. 
TODO: Put into test/cpp/jit folder -if (NOT MSVC AND NOT APPLE AND NOT USE_ROCM) +if (BUILD_TORCH_TEST AND NOT MSVC AND NOT APPLE AND NOT USE_ROCM) add_executable(test_jit ${TORCH_SRC_DIR}/csrc/jit/test_jit.cpp) target_link_libraries(test_jit torch ${TORCH_CUDA_LIBRARIES}) target_compile_definitions(test_jit PUBLIC USE_CATCH _FORCE_INLINES) @@ -379,10 +385,6 @@ if (NOT MSVC AND NOT APPLE AND NOT USE_ROCM) endif() if (BUILD_TORCH_TEST AND NOT NO_API AND NOT USE_ROCM) - target_include_directories(torch PUBLIC - ${TORCH_SRC_DIR}/csrc/api - ${TORCH_SRC_DIR}/csrc/api/include) - set(TORCH_API_TEST_DIR "${TORCH_SRC_DIR}/../test/cpp/api") add_executable(test_api diff --git a/torch/__init__.py b/torch/__init__.py index 3fbb0b76fcc386..a40111bcca6b02 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -298,3 +298,8 @@ def manager_path(): # attach docstrings to torch and tensor functions from . import _torch_docs, _tensor_docs, _storage_docs del _torch_docs, _tensor_docs, _storage_docs + + +def compiled_with_cxx11_abi(): + r"""Returns whether PyTorch was built with _GLIBCXX_USE_CXX11_ABI=1""" + return _C._GLIBCXX_USE_CXX11_ABI diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index 2194310a46d522..af367c3e544905 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -402,16 +402,6 @@ PyObject *THPModule_isDefaultTypeCuda(PyObject *_unused, PyObject *arg) { END_HANDLE_TH_ERRORS } -PyObject *THPModule_useZeroSizeDim(PyObject *_unused, PyObject *arg) { - HANDLE_TH_ERRORS -#ifdef USE_TH_SIZE_ZERO_DIM - Py_RETURN_TRUE; -#else - Py_RETURN_FALSE; -#endif - END_HANDLE_TH_ERRORS -} - static PyMethodDef TorchMethods[] = { {"_initExtension", (PyCFunction)THPModule_initExtension, METH_O, NULL}, {"_autograd_init", (PyCFunction)THPAutograd_initExtension, METH_NOARGS, NULL}, @@ -442,7 +432,6 @@ static PyMethodDef TorchMethods[] = { {"set_flush_denormal", (PyCFunction)THPModule_setFlushDenormal, METH_O, NULL}, {"get_default_dtype", (PyCFunction)THPModule_getDefaultDtype, METH_NOARGS, NULL}, {"_is_default_type_cuda", (PyCFunction)THPModule_isDefaultTypeCuda, METH_NOARGS, NULL}, - {"_use_zero_size_dim", (PyCFunction)THPModule_useZeroSizeDim, METH_NOARGS, NULL}, {NULL, NULL, 0, NULL} }; @@ -624,6 +613,13 @@ static PyObject* initModule() { ASSERT_TRUE(PyModule_AddObject(module, "has_mkl", at::hasMKL() ? Py_True : Py_False) == 0); +#ifdef _GLIBCXX_USE_CXX11_ABI + ASSERT_TRUE(PyModule_AddObject(module, "_GLIBCXX_USE_CXX11_ABI", + _GLIBCXX_USE_CXX11_ABI ? Py_True : Py_False) == 0); +#else + ASSERT_TRUE(PyModule_AddObject(module, "_GLIBCXX_USE_CXX11_ABI", Py_False) == 0); +#endif + auto& defaultGenerator = at::globalContext().defaultGenerator(at::kCPU); THPDefaultGenerator = (THPGenerator*)THPGenerator_NewWithGenerator( defaultGenerator); diff --git a/torch/csrc/api/include/torch/nn/cursor.h b/torch/csrc/api/include/torch/nn/cursor.h index c0f56eea72fbd0..2ae5c5d93752c1 100644 --- a/torch/csrc/api/include/torch/nn/cursor.h +++ b/torch/csrc/api/include/torch/nn/cursor.h @@ -48,7 +48,7 @@ class CursorBase { /// A `(key, value)` pair exposed by cursor iterators. 
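The compiled_with_cxx11_abi() helper added to torch/__init__.py above can be queried directly from Python, which is useful when building C++ extensions that must link against the same libstdc++ ABI as the installed wheel:

    import torch

    # True when the binary was built with _GLIBCXX_USE_CXX11_ABI=1.
    print(torch.compiled_with_cxx11_abi())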
struct Item { - Item(const std::string& key_, T& module_); + Item(const std::string& key_, T& value_); T& operator*(); const T& operator*() const; diff --git a/torch/csrc/autograd/anomaly_mode.h b/torch/csrc/autograd/anomaly_mode.h index 7327d03f11b887..1f12f0a65c7460 100644 --- a/torch/csrc/autograd/anomaly_mode.h +++ b/torch/csrc/autograd/anomaly_mode.h @@ -18,7 +18,7 @@ struct AnomalyMode { struct AnomalyMetadata { - virtual ~AnomalyMetadata(){}; + virtual ~AnomalyMetadata() = default; virtual void store_stack() = 0; virtual void print_stack() = 0; }; diff --git a/torch/csrc/autograd/aten_variable_hooks.cpp b/torch/csrc/autograd/aten_variable_hooks.cpp index 7a2c3974c2227c..2f3899e4f8b59a 100644 --- a/torch/csrc/autograd/aten_variable_hooks.cpp +++ b/torch/csrc/autograd/aten_variable_hooks.cpp @@ -6,6 +6,7 @@ namespace torch { namespace autograd { struct VariableHooks : public at::VariableHooksInterface { VariableHooks(at::VariableHooksArgs) {} void registerVariableTypeFor(at::Context*, at::Backend, at::ScalarType) const override; + at::Type& getVariableType(const at::Type&) const override; }; // Sigh, the registry doesn't support namespaces :( @@ -20,4 +21,8 @@ void VariableHooks::registerVariableTypeFor(at::Context* context, at::Backend ba register_variable_type_for(baseType); } +at::Type& VariableHooks::getVariableType(const at::Type& baseType) const { + return *VariableType::getType(baseType); +} + }} // torch::autograd diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index 8309ba1ce1038c..74e15f5caefe9d 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -159,7 +159,7 @@ struct GraphTask { std::unordered_map exec_info; std::vector captured_vars; - void init_to_execute(Function& graph_root, const edge_list& captures); + void init_to_execute(Function& graph_root, const edge_list& outputs); // The value of worker_device in the thread that created this task. 
// See Note [Reentrant backwards] @@ -499,14 +499,14 @@ struct ClearCallbacks { std::mutex& callbacks_lock; }; -auto Engine::execute(const edge_list& input_roots, +auto Engine::execute(const edge_list& roots, const variable_list& inputs, bool keep_graph, bool create_graph, const edge_list& outputs) -> variable_list { std::call_once(start_threads_flag, &Engine::start_threads, this); - validate_outputs(input_roots, const_cast(inputs), [](const std::string& msg) { + validate_outputs(roots, const_cast(inputs), [](const std::string& msg) { return msg; }); @@ -517,7 +517,7 @@ auto Engine::execute(const edge_list& input_roots, std::unique_lock lock(graph_task.mutex); // Now compute the dependencies for all executable functions and queue the root - auto graph_root = std::make_shared(input_roots, inputs); + auto graph_root = std::make_shared(roots, inputs); compute_dependencies(graph_root.get(), graph_task); if (!outputs.empty()) { graph_task.init_to_execute(*graph_root, outputs); diff --git a/torch/csrc/autograd/engine.h b/torch/csrc/autograd/engine.h index db8b3357ac2536..94490303ccc240 100644 --- a/torch/csrc/autograd/engine.h +++ b/torch/csrc/autograd/engine.h @@ -57,7 +57,7 @@ struct TORCH_API Engine { ReadyQueue& ready_queue(int device); void start_threads(); virtual void thread_init(int device); - virtual void thread_main(GraphTask *task); + virtual void thread_main(GraphTask *graph_task); virtual void thread_on_exception(FunctionTask& task, std::exception& e); std::once_flag start_threads_flag; diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index b02bdf3928f2ff..46a80b90b29ffa 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -328,7 +328,7 @@ struct TORCH_API Function : std::enable_shared_from_this { /// See Function::is_traceable() for definition. 
struct TraceableFunction : public Function { using Function::Function; - bool is_traceable() final override { + bool is_traceable() final { return true; } }; diff --git a/torch/csrc/autograd/function_hook.h b/torch/csrc/autograd/function_hook.h index 03c52fea54535c..f3cf5b2e793c6a 100644 --- a/torch/csrc/autograd/function_hook.h +++ b/torch/csrc/autograd/function_hook.h @@ -10,12 +10,12 @@ struct Variable; using variable_list = std::vector; struct FunctionPreHook { - virtual ~FunctionPreHook() {} + virtual ~FunctionPreHook() = default; virtual variable_list operator()(const variable_list& grads) = 0; }; struct FunctionPostHook { - virtual ~FunctionPostHook() {} + virtual ~FunctionPostHook() = default; virtual variable_list operator()(const variable_list& grad_input, const variable_list& grad_output) = 0; }; diff --git a/torch/csrc/autograd/functions/accumulate_grad.h b/torch/csrc/autograd/functions/accumulate_grad.h index 44d4b7f106c860..db86ae428d4060 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.h +++ b/torch/csrc/autograd/functions/accumulate_grad.h @@ -6,9 +6,9 @@ namespace torch { namespace autograd { struct AccumulateGrad : public Function { - explicit AccumulateGrad(Variable variable); + explicit AccumulateGrad(Variable variable_); - variable_list apply(variable_list&& inputs) override; + variable_list apply(variable_list&& grads) override; Variable variable; }; diff --git a/torch/csrc/autograd/functions/basic_ops.cpp b/torch/csrc/autograd/functions/basic_ops.cpp index b04b0f25ca42d5..c4a54d99d08702 100644 --- a/torch/csrc/autograd/functions/basic_ops.cpp +++ b/torch/csrc/autograd/functions/basic_ops.cpp @@ -11,7 +11,7 @@ namespace torch { namespace autograd { -auto Error::apply(variable_list&& grad_outputs) -> variable_list { +auto Error::apply(variable_list&& inputs) -> variable_list { throw std::runtime_error(msg); } diff --git a/torch/csrc/autograd/functions/tensor.h b/torch/csrc/autograd/functions/tensor.h index aa4b422136930f..1a21a360ba9fc2 100644 --- a/torch/csrc/autograd/functions/tensor.h +++ b/torch/csrc/autograd/functions/tensor.h @@ -13,7 +13,7 @@ namespace torch { namespace autograd { struct CopyBackwards : public Function { - variable_list apply(variable_list&& inputs) override; + variable_list apply(variable_list&& grads) override; at::Type *src_type; int32_t src_device = -1; @@ -23,9 +23,12 @@ struct CopyBackwards : public Function { // grad[idx] is defined by the relative sizes, strides, and offset of base and // view. struct CopySlices : public Function { - CopySlices(const Variable& base, at::TensorGeometry view, std::shared_ptr fn); + CopySlices( + const Variable& base_var, + at::TensorGeometry view_, + std::shared_ptr fn_); - variable_list apply(variable_list&& grads) override; + variable_list apply(variable_list&& inputs) override; void release_variables() override; at::TensorGeometry base; diff --git a/torch/csrc/autograd/input_buffer.h b/torch/csrc/autograd/input_buffer.h index 2e0febfc84b0bc..f1c02e0d78e565 100644 --- a/torch/csrc/autograd/input_buffer.h +++ b/torch/csrc/autograd/input_buffer.h @@ -22,14 +22,14 @@ struct InputBuffer { InputBuffer& operator=(InputBuffer&& other) = default; // Accumulates the variable at a specified index. - void add(size_t idx, Variable var); + void add(size_t pos, Variable var); int device() const; Variable operator[](size_t pos) { return buffer[pos]; } // Returns the inputs as a list of variables. Destroys given InputBuffer. 
- static std::vector variables(InputBuffer&& buffer); + static std::vector variables(InputBuffer&& g); private: std::vector buffer; diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index dd77dc193ba9bd..ba0fee1510baa2 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -185,7 +185,7 @@ struct TORCH_API RecordFunction { using thread_event_lists = std::vector>; // NOTE: changing profiler modes is **NOT THREAD SAFE**. You should ensure that // there no autograd functions are being executed when these function are used. -TORCH_API void enableProfiler(ProfilerState state); +TORCH_API void enableProfiler(ProfilerState new_state); TORCH_API thread_event_lists disableProfiler(); } // namespace profiler diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp index 08e494530040eb..e9d29bd0caa688 100644 --- a/torch/csrc/autograd/python_function.cpp +++ b/torch/csrc/autograd/python_function.cpp @@ -45,7 +45,7 @@ namespace torch { namespace autograd { VariableInfo::VariableInfo(const Variable& var) : type(&var.type()) - , size(var.sizes()) + , size(var.sizes().vec()) , requires_grad(var.requires_grad()) { if (var.type().is_cuda()) { device = var.get_device(); diff --git a/torch/csrc/autograd/python_variable_indexing.cpp b/torch/csrc/autograd/python_variable_indexing.cpp index cd8329cad01434..1aa21f84d45cf2 100644 --- a/torch/csrc/autograd/python_variable_indexing.cpp +++ b/torch/csrc/autograd/python_variable_indexing.cpp @@ -154,14 +154,6 @@ static Variable applySlicing(const Variable& self, PyObject* index, variable_lis result = applySelect(result, dim, THPUtils_unpackLong(obj)); } else if (PySlice_Check(obj)) { result = applySlice(result, dim, obj); -#ifndef USE_TH_SIZE_ZERO_DIM - if (result.numel() == 0) { - // TODO: currently we don't have support for 0-sized dims, so slicing a dim - // to size 0 will return a size 0 tensor. for now, just shortcircuit slicing - // and return that size 0 tensor. 
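With the shortcircuit above removed, slicing a dimension down to length zero keeps the remaining dimensions instead of collapsing to a 1-D empty tensor, consistent with the updated tests earlier in this diff. For example:

    import torch

    x = torch.randn(2, 3, 4)
    print(x[0:0].shape)                         # torch.Size([0, 3, 4]) -- other dims preserved
    print(torch.empty(0, 4).shape)              # torch.Size([0, 4])
    print(torch.empty(0).reshape(1, -1).shape)  # torch.Size([1, 0])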
- return result; - } -#endif dim++; } else if (obj == Py_Ellipsis) { dim += self.dim() - specified_dims; diff --git a/torch/csrc/autograd/saved_variable.h b/torch/csrc/autograd/saved_variable.h index 61a1d3b3eac172..037f06a7f95c11 100644 --- a/torch/csrc/autograd/saved_variable.h +++ b/torch/csrc/autograd/saved_variable.h @@ -45,10 +45,10 @@ class TORCH_API SavedVariable { std::weak_ptr grad_accumulator_; VariableVersion version_counter_; - uint32_t saved_version_; - uint32_t output_nr_; + uint32_t saved_version_ = 0; + uint32_t output_nr_ = 0; bool was_default_constructed_ = true; - bool requires_grad_; - bool has_grad_fn_; + bool requires_grad_ = false; + bool has_grad_fn_ = false; }; }} // namespace torch::autograd diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index 9bbae25d9c4d96..30aded0a85e73a 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -22,7 +22,7 @@ namespace torch { namespace autograd { Variable::Impl::Impl(at::Tensor data, bool requires_grad, Edge gradient_edge) - : TensorImpl(VariableType::getType(data), nullptr), + : TensorImpl(data.type().backend(), data.type().scalarType(), nullptr, /* is variable */ true), data_(std::move(data)), grad_fn_(std::move(gradient_edge.function)), requires_grad_(false), @@ -118,7 +118,9 @@ void Variable::Impl::backward( void Variable::Impl::set_data(Tensor new_data) { if (new_data.type() != data_.type()) { - type_ = VariableType::getType(new_data.type()); + scalar_type_ = new_data.type().scalarType(); + backend_ = new_data.type().backend(); + is_variable_ = true; // Clear grad_accumulator if it exists, since it stores the old type info. grad_accumulator_.reset(); } @@ -154,8 +156,8 @@ std::shared_ptr& Variable::ViewImpl::get_grad_fn() { AT_ASSERT(output_nr_ == 0); auto fn = std::make_shared(); fn->self_geometry = at::TensorGeometry(base_); - fn->size = sizes(); - fn->stride = strides(); + fn->size = sizes().vec(); + fn->stride = strides().vec(); fn->storage_offset = data_.storage_offset(); fn->set_next_edges(collect_next_edges(base_)); fn->add_input_metadata(base_.type(), sizes()); diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index c97a0322359a4d..d46008bbdd10b0 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -263,7 +263,7 @@ struct Variable::Impl : public at::TensorImpl { TORCH_API explicit Impl( at::Tensor data, bool requires_grad = false, - Edge edge = Edge()); + Edge gradient_edge = Edge()); ~Impl() override; @@ -327,9 +327,6 @@ struct Variable::Impl : public at::TensorImpl { /// Reset all expensive fields to free up resources void release_resources() override; - // Make this field public so we can access it from `Variable`. 
-  using at::TensorImpl::type_;
-
   std::string name;
   at::Tensor data_;
diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp
index 0e869876e8e1fa..8237239f99b639 100644
--- a/torch/csrc/cuda/comm.cpp
+++ b/torch/csrc/cuda/comm.cpp
@@ -74,7 +74,7 @@ tensor_list2d broadcast_coalesced(TensorList tensors, IntList devices, size_t bu
   }
 
   tensor_list2d outputs(devices.size());
-  outputs[0] = tensors;
+  outputs[0] = tensors.vec();
   for (auto & o : outputs)
     o.reserve(tensors.size());
 
diff --git a/torch/csrc/distributed/c10d/ddp.h b/torch/csrc/distributed/c10d/ddp.h
new file mode 100644
index 00000000000000..7b26c1475fc1c6
--- /dev/null
+++ b/torch/csrc/distributed/c10d/ddp.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include 
+
+#include 
+
+#include 
+
+#include 
+#include 
+#include 
+
+namespace c10d {
+inline void distBroadcastCoalesced(
+    std::vector& tensors,
+    int64_t bufferSize,
+    ProcessGroup& processGroup) {
+  auto tensorGroups = torch::utils::take_tensors(tensors, bufferSize);
+  // We store single-element vectors in `flatTensors` because
+  // `ProcessGroup::broadcast` takes a reference to a vector, which must be
+  // alive until the `wait()` call on the returned `Work` completes.
+  std::vector> flatTensors;
+  std::vector> work;
+  flatTensors.reserve(tensorGroups.size());
+  work.reserve(tensorGroups.size());
+  for (const auto& group : tensorGroups) {
+    // Flatten each group of tensors (whose size equals `bufferSize`) into a
+    // single tensor.
+    flatTensors.push_back({torch::utils::flatten_dense_tensors(group.tensors)});
+    BroadcastOptions broadcastOptions;
+    broadcastOptions.rootRank = 0;
+    broadcastOptions.rootTensor = 0;
+    // Enqueue a work item and collect the `Work` (essentially a "future") so we
+    // can `wait()` for its completion after we have collected all `Work` items.
+    work.push_back(
+        processGroup.broadcast(flatTensors.back(), broadcastOptions));
+  }
+  // Now loop through each group, wait for the broadcast to complete, and
+  // un-flatten the broadcast tensor back into device-local individual tensors.
+  for (size_t group = 0; group < tensorGroups.size(); ++group) {
+    auto& tensors = tensorGroups[group].tensors;
+    work[group]->wait();
+    const auto synced =
+        torch::utils::unflatten_dense_tensors(flatTensors[group][0], tensors);
+    AT_ASSERT(synced.size() == tensors.size());
+    for (size_t i = 0; i < synced.size(); ++i) {
+      // Copy into the per-process tensors.
+ tensors[i].copy_(synced[i], /*non_blocking=*/true); + } + } +} +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 2bd7a871dc36fc..797fcbcdd2432e 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -13,9 +13,10 @@ #include #include -#include "torch/csrc/Exceptions.h" -#include "torch/csrc/utils/object_ptr.h" -#include "torch/csrc/utils/pybind.h" +#include +#include +#include +#include namespace torch { namespace distributed { @@ -199,6 +200,8 @@ PyObject* c10d_init(PyObject* _unused) { &::c10d::ProcessGroup::Work::wait, py::call_guard()); + module.def("_dist_broadcast_coalesced", &::c10d::distBroadcastCoalesced); + Py_RETURN_TRUE; } diff --git a/torch/csrc/jit/argument_spec.h b/torch/csrc/jit/argument_spec.h index d6bd90cb708784..f404b4ce9a05c6 100644 --- a/torch/csrc/jit/argument_spec.h +++ b/torch/csrc/jit/argument_spec.h @@ -59,20 +59,21 @@ struct ArgumentSpec { for(int32_t i = 0; i < num_inputs; i++) { auto & pod = pods[i]; pod.is_tensor = static_cast(inputs[i].isTensor()); - if (!pod.is_tensor) continue; - at::Tensor t = inputs[i].toTensor(); - pod.defined = t.defined(); - if (pod.defined) { - pod.type = static_cast(t.type().scalarType()); - pod.device = (!t.type().is_cuda()) ? -1 : t.get_device(); - pod.requires_grad = with_grad && autograd::as_variable_ref(t).requires_grad(); - total_dims += t.ndimension(); - auto sizes = t.sizes(); - std::copy(sizes.begin(),sizes.end(), next_dim); - next_dim += sizes.size(); - auto strides = t.strides(); - std::copy(strides.begin(), strides.end(), next_dim); - next_dim += strides.size(); + if (pod.is_tensor) { + at::Tensor t = inputs[i].toTensor(); + pod.defined = t.defined(); + if (pod.defined) { + pod.type = static_cast(t.type().scalarType()); + pod.device = (!t.type().is_cuda()) ? -1 : t.get_device(); + pod.requires_grad = with_grad && autograd::as_variable_ref(t).requires_grad(); + total_dims += t.ndimension(); + auto sizes = t.sizes(); + std::copy(sizes.begin(),sizes.end(), next_dim); + next_dim += sizes.size(); + auto strides = t.strides(); + std::copy(strides.begin(), strides.end(), next_dim); + next_dim += strides.size(); + } } // each POD has a running tally of all dimensions including its own pod.total_dims = total_dims; diff --git a/torch/csrc/jit/attributes.h b/torch/csrc/jit/attributes.h index f69790cab52e00..53b87af9ef991d 100644 --- a/torch/csrc/jit/attributes.h +++ b/torch/csrc/jit/attributes.h @@ -28,7 +28,7 @@ struct AttributeValue { Symbol name; virtual AttributeKind kind() const = 0; virtual Ptr clone() const = 0; - virtual ~AttributeValue() {} + virtual ~AttributeValue() = default; }; template @@ -101,7 +101,7 @@ struct AttributeError : public std::exception { // we return Derived* pointers because Nodes are normally held as pointers. 
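The distBroadcastCoalesced helper introduced in ddp.h above follows a flatten, broadcast, unflatten, then copy-back pattern. A rough Python analogue using the existing torch._utils flatten helpers (the collective itself is elided here, since it needs an initialized process group):

    import torch
    from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

    tensors = [torch.ones(3), torch.arange(4.)]
    flat = _flatten_dense_tensors(tensors)   # one contiguous buffer for the whole bucket
    # ... a broadcast over `flat` would run here ...
    for t, synced in zip(tensors, _unflatten_dense_tensors(flat, tensors)):
        t.copy_(synced)                      # copy synchronized values back in place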
template struct Attributes { - Attributes() {} + Attributes() = default; void copyAttributes(const Attributes & rhs) { values_.clear(); for(auto & i : rhs.values_) { diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index c830dc45a537f5..7f250bf7c452aa 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -9,6 +9,7 @@ #include #include +#include namespace torch { namespace jit { @@ -564,14 +565,13 @@ static void lambdaLiftReverse(Gradient& grad_desc, ReverseDetails& rev_info) { reverse_block->owningNode()->destroy(); } -Gradient differentiate(std::shared_ptr& _graph, const std::vector& requires_grad) { +Gradient differentiate(std::shared_ptr& graph, const std::vector& requires_grad) { Gradient grad_desc; // Take ownership of the graph - JIT_ASSERTM( - _graph.use_count() == 1, - "differentiate will mutate and destroy the graph, so it requires " - "graph.use_count() == 1, but found ", _graph.use_count()); - std::swap(_graph, grad_desc.f); + JIT_ASSERTM(graph.use_count() == 1, + "differentiate will mutate and destroy the graph, so it requires " + "graph.use_count() == 1, but found %d", graph.use_count()); + std::swap(graph, grad_desc.f); // XXX: Take care when handling outputs - they can be duplicated! WithInsertPoint guard(grad_desc.f->block()); diff --git a/torch/csrc/jit/autodiff.h b/torch/csrc/jit/autodiff.h index 6dd2be9db0e779..ea2b7a1170efeb 100644 --- a/torch/csrc/jit/autodiff.h +++ b/torch/csrc/jit/autodiff.h @@ -4,7 +4,9 @@ #include "torch/csrc/jit/ir.h" #include + #include +#include namespace torch { namespace jit { diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index 3c4ad0c130ea31..47e593bbb125e2 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -22,8 +22,13 @@ Value* insertConstant( n->f_(attr::value, val.toDouble()); n->output()->setType(FloatType::get()); } else if(val.isIntList()) { - n->is_(attr::value, val.toIntList()->elements()); + n->is_(attr::value, val.toIntList()->elements().vec()); n->output()->setType(ListType::ofInts()); + } else if(val.isTensorList()) { + n->ts_(attr::value, fmap(val.toTensorList()->elements(), [](const at::Tensor & t) { + return autograd::Variable(t).data(); + })); + n->output()->setType(ListType::ofTensors()); } else { throw std::runtime_error("Unsupported value kind: " + val.tagKind()); } @@ -66,6 +71,14 @@ RegisterOperators reg({ push(stack, is); return 0; }; + } else if(type->isSubtypeOf(ListType::ofTensors())) { + auto ts = fmap(node->ts(attr::value), [](const at::Tensor & t) -> at::Tensor { + return autograd::make_variable(t); + }); + return [ts](Stack& stack) { + push(stack, ts); + return 0; + }; } else { std::stringstream ss; ss << "constant literal not supported for: " << type->str(); diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 71dec999c40216..20208af5496c28 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -1,6 +1,7 @@ #include "torch/csrc/jit/export.h" -#include "torch/csrc/onnx/onnx.h" #include "torch/csrc/autograd/symbolic.h" +#include "onnx/onnx.pb.h" +#include "torch/csrc/onnx/onnx.h" #include "torch/csrc/utils/functional.h" #include @@ -18,7 +19,8 @@ namespace torch { namespace jit { namespace { -namespace onnx = ::torch::onnx; +namespace onnx_torch = ::torch::onnx; +namespace onnx = ::ONNX_NAMESPACE; std::string value_name(Value* n) { return n->uniqueName(); @@ -26,7 +28,7 @@ std::string value_name(Value* n) { struct ExportContext { size_t num_blocks = 0; - 
onnx::OperatorExportTypes operator_export_type; + onnx_torch::OperatorExportTypes operator_export_type; }; void encodeGraph(onnx::GraphProto * p_g, const std::shared_ptr & g, @@ -43,34 +45,37 @@ void encodeTensor(onnx::TensorProto * p, const at::Tensor & tensor, for(auto d : tensor.sizes()) { p->add_dims(d); } - onnx::DataType onnx_type; + onnx::TensorProto_DataType onnx_type; // Most integral types and float16 need to be serialized as int32 at::ScalarType cast_type = tensor.type().scalarType(); switch(tensor.type().scalarType()) { case at::kDouble: - onnx_type = onnx::kDOUBLE; + onnx_type = onnx::TensorProto_DataType_DOUBLE; break; case at::kFloat: - onnx_type = onnx::kFLOAT; + onnx_type = onnx::TensorProto_DataType_FLOAT; break; case at::kHalf: - onnx_type = onnx::kFLOAT16; + onnx_type = onnx::TensorProto_DataType_FLOAT16; cast_type = at::kInt; break; case at::kByte: + onnx_type = onnx::TensorProto_DataType_UINT8; + cast_type = at::kInt; + break; case at::kChar: - onnx_type = onnx::kINT8; + onnx_type = onnx::TensorProto_DataType_INT8; cast_type = at::kInt; break; case at::kShort: - onnx_type = onnx::kINT16; + onnx_type = onnx::TensorProto_DataType_INT16; cast_type = at::kInt; break; case at::kInt: - onnx_type = onnx::kINT32; + onnx_type = onnx::TensorProto_DataType_INT32; break; case at::kLong: - onnx_type = onnx::kINT64; + onnx_type = onnx::TensorProto_DataType_INT64; break; default: AT_ERROR("unexpected tensor scalar type"); @@ -85,13 +90,14 @@ void encodeTensor(onnx::TensorProto * p, const at::Tensor & tensor, if (external_ref) { // For now, we use the name of the tensor as the external lookup name to // avoid ONNX protobuf changes. - JIT_ASSERT(external_ref.value() == p->get_name()); + JIT_ASSERT(external_ref.value() == p->name()); JIT_ASSERT(raw_data_export_map != nullptr); JIT_ASSERT(raw_data_export_map->count(external_ref.value()) == 0); (*raw_data_export_map)[external_ref.value()] = t; - p->set_external_data_present(); + p->set_raw_data("__EXTERNAL"); } else { - p->set_raw_data(t); + JIT_ASSERT(t.is_contiguous()); + p->set_raw_data(std::string(static_cast(t.data_ptr()), t.type().elementSizeInBytes() * t.numel())); } } @@ -102,50 +108,50 @@ void addAttribute(onnx::NodeProto * n_p, jit::Node * n, jit::Symbol name, Export switch(n->kindOf(name)) { case AttributeKind::f: attr->set_f(n->f(name)); - attr->set_type(onnx::aFLOAT); + attr->set_type(onnx::AttributeProto_AttributeType_FLOAT); break; case AttributeKind::fs: - attr->set_type(onnx::aFLOATS); + attr->set_type(onnx::AttributeProto_AttributeType_FLOATS); for(auto & v : n->fs(name)) attr->add_floats(v); break; case AttributeKind::i: - attr->set_type(onnx::aINT); + attr->set_type(onnx::AttributeProto_AttributeType_INT); attr->set_i(n->i(name)); break; case AttributeKind::is: - attr->set_type(onnx::aINTS); + attr->set_type(onnx::AttributeProto_AttributeType_INTS); for(auto & v : n->is(name)) attr->add_ints(v); break; case AttributeKind::s: - attr->set_type(onnx::aSTRING); + attr->set_type(onnx::AttributeProto_AttributeType_STRING); attr->set_s(n->s(name)); break; case AttributeKind::ss: - attr->set_type(onnx::aSTRINGS); + attr->set_type(onnx::AttributeProto_AttributeType_STRINGS); for(auto & v : n->ss(name)) attr->add_strings(v); break; case AttributeKind::t: { - attr->set_type(onnx::aTENSOR); + attr->set_type(onnx::AttributeProto_AttributeType_TENSOR); auto t = attr->mutable_t(); encodeTensor(t, n->t(name)); } break; case AttributeKind::ts: - attr->set_type(onnx::aTENSORS); + 
attr->set_type(onnx::AttributeProto_AttributeType_TENSORS); for(auto & v : n->ts(name)) { auto t = attr->add_tensors(); encodeTensor(t, v); } break; case AttributeKind::g: { - attr->set_type(onnx::aGRAPH); + attr->set_type(onnx::AttributeProto_AttributeType_GRAPH); auto g = attr->mutable_g(); encodeGraph(g, n->g(name), {}, ctx, nullptr); } break; case AttributeKind::gs: - attr->set_type(onnx::aGRAPHS); + attr->set_type(onnx::AttributeProto_AttributeType_GRAPHS); for(auto & v : n->gs(name)) { auto g = attr->add_graphs(); encodeGraph(g, v, {}, ctx, nullptr); @@ -154,49 +160,52 @@ void addAttribute(onnx::NodeProto * n_p, jit::Node * n, jit::Symbol name, Export } } -void encodeTypeProtoTensorType(onnx::TypeProtoTensor* tensor_type, Value* n) { +void encodeTypeProtoTensorType(onnx::TypeProto_Tensor* tensor_type, Value* n) { onnx::TensorShapeProto* shape = tensor_type->mutable_shape(); if (TensorTypePtr node_type = n->type()->cast()) { const std::vector& sizes = node_type->sizes(); - for (std::int64_t s : sizes) { - shape->add_dim(s); + for (size_t i = 0; i < sizes.size(); i++) { + shape->add_dim(); + shape->mutable_dim(i)->set_dim_value(sizes[i]); } - onnx::DataType onnx_type; + onnx::TensorProto_DataType onnx_type; switch(node_type->scalarType()) { case at::kDouble: - onnx_type = onnx::kDOUBLE; + onnx_type = onnx::TensorProto_DataType_DOUBLE; break; case at::kFloat: - onnx_type = onnx::kFLOAT; + onnx_type = onnx::TensorProto_DataType_FLOAT; break; case at::kHalf: - onnx_type = onnx::kFLOAT16; + onnx_type = onnx::TensorProto_DataType_FLOAT16; break; case at::kByte: + onnx_type = onnx::TensorProto_DataType_UINT8; + break; case at::kChar: - onnx_type = onnx::kINT8; + onnx_type = onnx::TensorProto_DataType_INT8; break; case at::kShort: - onnx_type = onnx::kINT16; + onnx_type = onnx::TensorProto_DataType_INT16; break; case at::kInt: - onnx_type = onnx::kINT32; + onnx_type = onnx::TensorProto_DataType_INT32; break; case at::kLong: - onnx_type = onnx::kINT64; + onnx_type = onnx::TensorProto_DataType_INT64; break; default: AT_ERROR("unexpected tensor scalar type"); break; } - tensor_type->set_data_type(onnx_type); + tensor_type->set_elem_type(onnx_type); } } void encodeValueInfo(onnx::ValueInfoProto* v, Value* n) { v->set_name(value_name(n)); onnx::TypeProto* t = v->mutable_type(); - onnx::TypeProtoTensor* tensor_type = t->mutable_tensor_type(); + onnx::TypeProto_Tensor* tensor_type = t->mutable_tensor_type(); encodeTypeProtoTensorType(tensor_type, n); } @@ -226,7 +235,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, encodeValueInfo(v, output); } for (auto node : b->nodes()) { - bool is_raw_export = ctx->operator_export_type == onnx::OperatorExportTypes::RAW; + bool is_raw_export = ctx->operator_export_type == onnx_torch::OperatorExportTypes::RAW; if (node->kind() == prim::Undefined && !is_raw_export) { // Undefined nodes are used to implement optional inputs. 
One // way to "not provide" an optional input is to create an @@ -253,7 +262,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, JIT_ASSERT(!node->kind().is_onnx()); p_n->set_domain(node->kind().domainString()); } - else if (ctx->operator_export_type != onnx::OperatorExportTypes::ONNX_ATEN_FALLBACK) { + else if (ctx->operator_export_type != onnx_torch::OperatorExportTypes::ONNX_ATEN_FALLBACK) { JIT_ASSERT(node->kind().is_onnx()); } p_n->set_op_type(node->kind().toUnqualString()); @@ -263,7 +272,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, if (is_raw_export && node->blocks().size() > 0) { auto blocks = p_n->add_attribute(); blocks->set_name("_blocks"); - blocks->set_type(onnx::aGRAPHS); + blocks->set_type(onnx::AttributeProto_AttributeType_GRAPHS); for (auto block : node->blocks()) { auto graph = blocks->add_graphs(); encodeBlock(graph, block, initializers, ctx, raw_data_export_map); @@ -274,7 +283,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, auto body = p_n->add_attribute(); body->set_name("body"); - body->set_type(onnx::aGRAPH); + body->set_type(onnx::AttributeProto_AttributeType_GRAPH); auto g = body->mutable_g(); encodeBlock(g, node->blocks()[0], {}, ctx, raw_data_export_map); } @@ -283,13 +292,13 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, auto true_branch = p_n->add_attribute(); true_branch->set_name("then_branch"); - true_branch->set_type(onnx::aGRAPH); + true_branch->set_type(onnx::AttributeProto_AttributeType_GRAPH); auto true_g = true_branch->mutable_g(); encodeBlock(true_g, node->blocks()[0], {}, ctx, raw_data_export_map); auto false_branch = p_n->add_attribute(); false_branch->set_name("else_branch"); - false_branch->set_type(onnx::aGRAPH); + false_branch->set_type(onnx::AttributeProto_AttributeType_GRAPH); auto false_g = false_branch->mutable_g(); encodeBlock(false_g, node->blocks()[1], {}, ctx, raw_data_export_map); } @@ -300,7 +309,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, for (auto & tensor : initializers) { // TODO: stop using positions to determine which initializers // match to which inputs - std::string name = p_g->get_input_name(inputs_count++); + std::string name = p_g->input(inputs_count++).name(); auto p = p_g->add_initializer(); p->set_name(name); if (raw_data_export_map) { @@ -314,8 +323,8 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, void encodeModel(onnx::ModelProto* p_m, const std::shared_ptr& g, const std::vector& initializers, RawDataExportMap* raw_data_export_map = nullptr, - onnx::OperatorExportTypes operator_export_type - = onnx::OperatorExportTypes::ONNX) { + onnx_torch::OperatorExportTypes operator_export_type + = onnx_torch::OperatorExportTypes::ONNX) { onnx::GraphProto* p_g = p_m->mutable_graph(); ExportContext ctx; ctx.operator_export_type = operator_export_type; @@ -334,7 +343,7 @@ std::string getNodeStackTraceString(Node* n) { } } // namespace -void validateGraph(const std::shared_ptr& graph, onnx::OperatorExportTypes operator_export_type) { +void validateGraph(const std::shared_ptr& graph, onnx_torch::OperatorExportTypes operator_export_type) { for (auto node : graph->nodes()) { // Macro'ed so we get a marginally better line number on failed export #define FAIL_EXPORT(name) \ @@ -356,7 +365,7 @@ void validateGraph(const std::shared_ptr& graph, onnx::OperatorExportType "Cannot export individual pack_padded_sequence or pad_packed_sequence; these operations must occur in pairs.\n\nUsage of this operation occurred at:\n" + getNodeStackTraceString(node)); } - bool is_aten_fallback = 
operator_export_type == onnx::OperatorExportTypes::ONNX_ATEN_FALLBACK; + bool is_aten_fallback = operator_export_type == onnx_torch::OperatorExportTypes::ONNX_ATEN_FALLBACK; if (!node->kind().is_onnx() && !is_aten_fallback && node->kind() != prim::Undefined) { FAIL_EXPORT( "Couldn't export operator " + node->kind().toDisplayString() + "\n\nDefined at:\n" + @@ -367,6 +376,182 @@ void validateGraph(const std::shared_ptr& graph, onnx::OperatorExportType } } +// Pretty printing +namespace { +constexpr char indent_char = ' '; +constexpr size_t indent_multiplier = 2; + +std::string idt(size_t indent) { + return std::string(indent * indent_multiplier, indent_char); +} + +std::string nlidt(size_t indent) { + return std::string("\n") + idt(indent); +} + +void dump(const onnx::TensorProto& tensor, std::ostream& stream) { + stream << "TensorProto shape: ["; + for (int i = 0; i < tensor.dims_size(); ++i) { + stream << tensor.dims(i) << (i == tensor.dims_size() - 1 ? "" : " "); + } + stream << "]"; +} + +void dump(const onnx::TensorShapeProto& shape, std::ostream& stream) { + for (int i = 0; i < shape.dim_size(); ++i) { + auto &dim = shape.dim(i); + if (dim.has_dim_value()) { + stream << dim.dim_value(); + } else { + stream << "?"; + } + stream << (i == shape.dim_size() - 1 ? "" : " "); + } +} + +void dump(const onnx::TypeProto_Tensor& tensor_type, std::ostream& stream) { + stream << "Tensor dims: "; + dump(tensor_type.shape(), stream); +} + +void dump(const onnx::TypeProto& type, std::ostream& stream) { + dump(type.tensor_type(), stream); +} + +void dump(const onnx::ValueInfoProto& value_info, std::ostream& stream) { + stream << "{name: \"" << value_info.name() + << "\", type:"; + dump(value_info.type(), stream); + stream << "}"; +} + +void dump(const onnx::GraphProto& graph, std::ostream& stream, size_t indent); + +void dump(const onnx::AttributeProto& attr, std::ostream& stream, size_t indent) { + stream << "{ name: '" << attr.name() << "', type: "; + if (attr.has_f()) { + stream << "float, value: " << attr.f(); + } else if (attr.has_i()) { + stream << "int, value: " << attr.i(); + } else if (attr.has_s()) { + stream << "string, value: '" << attr.s() << "'"; + } else if (attr.has_g()) { + stream << "graph, value:\n"; + dump(attr.g(), stream, indent+1); + stream << nlidt(indent); + } else if (attr.has_t()) { + stream << "tensor, value:"; + dump(attr.t(), stream); + } else if (attr.floats_size()) { + stream << "floats, values: ["; + for (int i = 0; i < attr.floats_size(); ++i) + stream << attr.floats(i) << (i == attr.floats_size() - 1 ? "" : " "); + stream << "]"; + } else if (attr.ints_size()) { + stream << "ints, values: ["; + for (int i = 0; i < attr.ints_size(); ++i) + stream << attr.ints(i) << (i == attr.ints_size() - 1 ? "" : " "); + stream << "]"; + } else if (attr.strings_size()) { + stream << "strings, values: ["; + for (int i = 0; i < attr.strings_size(); ++i) + stream << "'" << attr.strings(i) << "'" << (i == attr.strings_size() - 1 ? 
"" : " "); + stream << "]"; + } else if (attr.tensors_size()) { + stream << "tensors, values: ["; + for (auto& t : attr.tensors()) { + dump(t, stream); + } + stream << "]"; + } else if (attr.graphs_size()) { + stream << "graphs, values: ["; + for (auto& g : attr.graphs()) { + dump(g, stream, indent+1); + } + stream << "]"; + } else { + stream << "UNKNOWN"; + } + stream << "}"; +} + +void dump(const onnx::NodeProto& node, std::ostream& stream, size_t indent) { + stream << "Node {type: \"" << node.op_type() << "\", inputs: ["; + for (int i = 0; i < node.input_size(); ++i) { + stream << node.input(i) << (i == node.input_size() - 1 ? "" : ","); + } + stream << "], outputs: ["; + for (int i = 0; i < node.output_size(); ++i) { + stream << node.output(i) << (i == node.output_size() - 1 ? "" : ","); + } + stream << "], attributes: ["; + for (int i = 0; i < node.attribute_size(); ++i) { + dump(node.attribute(i), stream, indent+1); + stream << (i == node.attribute_size() - 1 ? "" : ","); + } + stream << "]}"; +} + +void dump(const onnx::GraphProto& graph, std::ostream& stream, size_t indent) { + stream << idt(indent) << "GraphProto {" << nlidt(indent+1) + << "name: \"" << graph.name() << "\"" << nlidt(indent+1) + << "inputs: ["; + for (int i = 0; i < graph.input_size(); ++i) { + dump(graph.input(i), stream); + stream << (i == graph.input_size() - 1 ? "" : ","); + } + stream << "]" << nlidt(indent+1) + << "outputs: ["; + for (int i = 0; i < graph.output_size(); ++i) { + dump(graph.output(i), stream); + stream << (i == graph.output_size() - 1 ? "" : ","); + } + stream << "]" << nlidt(indent+1) + << "initializers: ["; + for (int i = 0; i < graph.initializer_size(); ++i) { + dump(graph.initializer(i), stream); + stream << (i == graph.initializer_size() - 1 ? "" : ","); + } + stream << "]" << nlidt(indent+1) + << "nodes: [" << nlidt(indent+2); + for (int i = 0; i < graph.node_size(); ++i) { + dump(graph.node(i), stream, indent+2); + if (i != graph.node_size() - 1) stream << "," << nlidt(indent+2); + } + stream << nlidt(indent+1) << "]\n" << idt(indent) << "}\n"; +} + +void dump(const onnx::OperatorSetIdProto& operator_set_id, std::ostream& stream) { + stream << "OperatorSetIdProto { domain: " << operator_set_id.domain() << "}"; +} + +void dump(const onnx::ModelProto& model, std::ostream& stream, size_t indent) { + stream << idt(indent) + << "ModelProto {" << nlidt(indent+1) + << "producer_name: \"" << model.producer_name() << "\"" << nlidt(indent+1) + << "domain: \"" << model.domain() << "\"" << nlidt(indent+1) + << "doc_string: \"" << model.doc_string() << "\""; + if (model.has_graph()) { + stream << nlidt(indent+1) << "graph:\n"; + dump(model.graph(), stream, indent+2); + } + if (model.opset_import_size()) { + stream << idt(indent+1) << "opset_import: ["; + for (auto &opset_imp : model.opset_import()) { + dump(opset_imp, stream); + } + stream << "],\n"; + } + stream << idt(indent) << "}\n"; +} +} // namespace + +std::string prettyPrint(const onnx::ModelProto& model) { + std::stringstream ss; + dump(model, ss, 0); + return ss.str(); +} + } namespace { @@ -376,14 +561,15 @@ RawDataExportMap ToModelProto( const std::vector & initializers, int64_t onnx_opset_version, bool defer_weight_export, - onnx::OperatorExportTypes operator_export_type, + onnx_torch::OperatorExportTypes operator_export_type, onnx::ModelProto *model_proto) { - if (operator_export_type != onnx::OperatorExportTypes::RAW) { + if (operator_export_type != onnx_torch::OperatorExportTypes::RAW) { validateGraph(graph, operator_export_type); } 
model_proto->set_producer_name("pytorch"); model_proto->set_producer_version("0.3"); + model_proto->set_ir_version(onnx::IR_VERSION); auto* imp = model_proto->add_opset_import(); // This is the version of ONNX operator set we are targeting imp->set_version(onnx_opset_version); @@ -411,12 +597,12 @@ std::string PrettyPrintExportedGraph( int64_t onnx_opset_version, bool defer_weight_export, ::torch::onnx::OperatorExportTypes operator_export_type) { - ::torch::onnx::ModelProto model_proto; + ::ONNX_NAMESPACE::ModelProto model_proto; RawDataExportMap raw_data_export_map; raw_data_export_map = ToModelProto( graph, initializers, onnx_opset_version, defer_weight_export, operator_export_type, &model_proto); - return model_proto.prettyPrint(); + return prettyPrint(model_proto); } // export_raw_ir will export IR ops without turning them into ONNX ops. @@ -430,21 +616,12 @@ std::tuple ExportGraph( int64_t onnx_opset_version, bool defer_weight_export, ::torch::onnx::OperatorExportTypes operator_export_type) { - ::torch::onnx::ModelProto model_proto; + ::ONNX_NAMESPACE::ModelProto model_proto; RawDataExportMap raw_data_export_map; raw_data_export_map = ToModelProto( graph, initializers, onnx_opset_version, defer_weight_export, operator_export_type, &model_proto); - - size_t out_size; - pb_get_encoded_size(&out_size, onnx_ModelProto_fields, &model_proto.proto); - - // Allocate storage and export the graph - std::string out(out_size, '\0'); - pb_ostream_t ostream = pb_ostream_from_buffer(reinterpret_cast(&out[0]), out_size); - pb_encode(&ostream, onnx_ModelProto_fields, &model_proto.proto); - - return std::make_tuple(out, raw_data_export_map); + return std::make_tuple(model_proto.SerializeAsString(), raw_data_export_map); } }} diff --git a/torch/csrc/jit/fusion_compiler.cpp b/torch/csrc/jit/fusion_compiler.cpp index 8d20045efefe6a..22f8b40ba30542 100644 --- a/torch/csrc/jit/fusion_compiler.cpp +++ b/torch/csrc/jit/fusion_compiler.cpp @@ -345,18 +345,14 @@ std::vector emitCompilationUnit(std::ostream & out, size_t i = 0; for(auto o : subgraph.outputs()) { auto & desc = agraph.output_desc[i++]; - if(o->node()->kind() != aten::cat) { + if(o->node()->kind() != prim::FusedConcat) { emitFormal(o, desc); concat_desc.emplace_back(); flat_output_nodes.push_back(o); } else { auto cat = o->node(); - auto tensor_inputs = cat->inputs(); - // We need to drop the dim arg - tensor_inputs = tensor_inputs.slice(0, tensor_inputs.size() - 1); - size_t nInputs = tensor_inputs.size(); - concat_desc.emplace_back(desc, nInputs, cat->get(attr::dim).value()); - for(auto c : tensor_inputs) { + concat_desc.emplace_back(desc, cat->inputs().size(), cat->i(attr::dim)); + for(auto c : cat->inputs()) { emitFormal(c, *concat_desc.back().subtensorDesc); flat_output_nodes.push_back(c); } @@ -386,8 +382,9 @@ std::vector emitCompilationUnit(std::ostream & out, } for(auto n : subgraph.nodes()) { - if(n->kind() == aten::cat) - continue; // Concat nodes by narrowing the output Tensors before the kernel runs + // FusedConcat nodes work by narrowing the output Tensors before the kernel runs + if (n->kind() == prim::FusedConcat) + continue; env.s("node",valueName(n->output())); env.s("rhs", encodeRHS(n)); body << format("auto ${node} = ${rhs};\n",env); diff --git a/torch/csrc/jit/fusion_compiler.h b/torch/csrc/jit/fusion_compiler.h index 6c4759aefb692a..c2f35ee0aa2074 100644 --- a/torch/csrc/jit/fusion_compiler.h +++ b/torch/csrc/jit/fusion_compiler.h @@ -86,7 +86,7 @@ struct CompiledFusionFunction { 
TH_DISALLOW_COPY_AND_ASSIGN(CompiledFusionFunction); CompiledFusionFunction(const std::string & name, AnnotatedGraph & agraph); - virtual ~CompiledFusionFunction() {} + virtual ~CompiledFusionFunction() = default; // expects outputs to be pre-allocated void launch_with_tensors(at::ArrayRef inputs, at::ArrayRef outputs); diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index df81c378ad137d..56a836b312d0c7 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -21,6 +21,7 @@ #include "torch/csrc/jit/passes/specialize_undef.h" #include "torch/csrc/jit/passes/loop_unrolling.h" #include "torch/csrc/jit/passes/lower_grad_of.h" +#include "torch/csrc/jit/passes/constant_propagation.h" #include "torch/csrc/jit/symbolic_variable.h" #include "torch/csrc/jit/ivalue.h" @@ -240,14 +241,7 @@ struct GraphExecutorImpl { , symbolically_differentiable(symbolically_differentiable) , may_introduce_gradient(calcMayIntroduceGradient(this->graph->block())) {} GraphExecutorImpl(std::shared_ptr graph, bool optimize) - : GraphExecutorImpl(graph, optimize, isDifferentiable(*graph)) { - for(auto input : graph->inputs()) { - JIT_ASSERTM(input->type()->kind() != TypeKind::TupleType, "tuples cannot be inputs to the graph"); - } - for(auto output : graph->outputs()) { - JIT_ASSERTM(output->type()->kind() != TypeKind::TupleType, "tuples cannot be outputs to the graph"); - } - } + : GraphExecutorImpl(graph, optimize, isDifferentiable(*graph)) {} // entry point where execution begins void run(Stack & stack) { @@ -516,28 +510,28 @@ void runRequiredPasses(const std::shared_ptr& g) { RemoveExpands(g); } -void specializeToSpec(const std::shared_ptr& graph_, const ArgumentSpec& spec) { +void specializeToSpec(const std::shared_ptr& graph, const ArgumentSpec& spec) { // clean up GradOf and AutogradAdd nodes // this must be first because later passes do not know what GradOfs are std::vector defined; for(size_t i = 0; i < spec.size(); ++i) { defined.push_back(spec.at(i).defined()); } - specializeUndef(*graph_, defined); + specializeUndef(*graph, defined); // required passes shared with autograd fallback - runRequiredPasses(graph_); + runRequiredPasses(graph); // Decompose addmm nodes to add + mm, so expands can be inserted and // gradients accumulated on the backward pass // // In the future, if we need more passes like this, we should convert this // into a generic canonicalization pass. - DecomposeAddmm(graph_); + DecomposeAddmm(graph); // clean up dead constants from specialization - EliminateDeadCode(graph_); + EliminateDeadCode(graph); // calculate all input shapes - PropagateInputShapes(*graph_, spec); + PropagateInputShapes(*graph, spec); } void runOptimization(std::shared_ptr & graph, bool graphMustSupportVariables) { @@ -554,7 +548,7 @@ void runOptimization(std::shared_ptr & graph, bool graphMustSupportVariab // They also may assume that concrete sizes/strides are availiable UnrollLoops(graph); - + ConstantPropagation(graph); //TODO: create peephole optimizations that are safe to run // when we are using variables, and when we do not know sizes. 
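For context, a rough usage sketch of the executor this pass pipeline serves; it assumes a std::shared_ptr<Graph> with a single tensor input and a single tensor output, and is illustrative rather than part of the patch:

    // Illustrative: optimization (including the passes above) is applied lazily
    // the first time the executor runs for a given argument specialization.
    at::Tensor runOnce(std::shared_ptr<Graph> graph, at::Tensor input) {
      GraphExecutor executor(graph, /*optimize=*/true);
      Stack stack;
      stack.push_back(IValue(std::move(input)));
      executor.run(stack);                  // results are left on the stack
      return stack.back().toTensor();
    }
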
PeepholeOptimize(graph); diff --git a/torch/csrc/jit/graph_executor.h b/torch/csrc/jit/graph_executor.h index 4e862c9e0a1e44..2693af50af1025 100644 --- a/torch/csrc/jit/graph_executor.h +++ b/torch/csrc/jit/graph_executor.h @@ -34,7 +34,7 @@ struct GraphExecutorState { struct GraphExecutorImpl; struct TORCH_API GraphExecutor { - GraphExecutor() {} + GraphExecutor() = default; GraphExecutor(std::shared_ptr graph, bool optimize = true); // note: if not specified, symbolically_differentiable is computed from the graph. GraphExecutor(std::shared_ptr graph, bool optimize, bool symbolically_differentiable); diff --git a/torch/csrc/jit/graph_node_list.h b/torch/csrc/jit/graph_node_list.h index 996a8b2c75fa0f..054b9517776863 100644 --- a/torch/csrc/jit/graph_node_list.h +++ b/torch/csrc/jit/graph_node_list.h @@ -1,3 +1,5 @@ +#pragma once + #include "torch/csrc/jit/assertions.h" namespace torch { namespace jit { diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index 5b128fd822dafd..a453925cf2f8eb 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -1,5 +1,5 @@ #include "torch/csrc/jit/import.h" -#include "torch/csrc/onnx/onnx.npb.h" +#include "onnx/onnx.pb.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/utils/functional.h" #include "torch/csrc/jit/assertions.h" @@ -16,401 +16,60 @@ namespace torch { namespace jit { namespace { -// Deserialized data - -struct Tensor_ { - std::vector dims; - std::vector raw_data; - onnx_TensorProto_DataType data_type; -}; - -struct AttributeValue_ { - std::string name; - onnx_AttributeProto_AttributeType type; - double f; - int64_t i; - std::string s; - Tensor_ t; - std::string g; - std::vector fs; - std::vector is; - std::vector ss; - std::vector ts; - std::vector gs; -}; - -struct Value_ { - std::string name; -}; - -struct Node_ { - std::string op_type; - std::string domain; - std::vector inputs; - std::vector outputs; - std::vector attrs; -}; - -struct Graph_ { - std::vector inputs; - std::vector outputs; - std::vector nodes; - std::vector initializers; -}; - -struct Model_ { - Graph_ graph; -}; - - -// Readers - -struct ReaderBase { - ReaderBase() {} - ReaderBase(pb_callback_t& cb) { - initialize_callback(cb); - } - - void initialize_callback(pb_callback_t& cb) { - cb.funcs.decode = ReaderBase::decode; - cb.arg = this; - } - - virtual void decode(pb_istream_t *stream) = 0; - - static bool decode(pb_istream_t *stream, const pb_field_t *, void **_self) { - ReaderBase* self = *reinterpret_cast(_self); - self->decode(stream); - return true; - } -}; - - -template -struct Reader : ReaderBase {}; - -template -struct Reader> : Reader { - Reader(pb_callback_t& cb) : Reader(cb) {} - // Decode is going to be called repeatedly from the callback - // (registered in the parent class constructor) each time an - // element is encountered. So all we do is relay the decoding - // through the parent class decode and push the result, every - // time this decode is called. - virtual void decode(pb_istream_t *stream) override { - Reader::decode(stream); - values.push_back(std::move(Reader::value)); - } - std::vector values; -}; - -template<> -struct Reader : ReaderBase { - Reader(pb_callback_t& cb) : ReaderBase(cb) {} - virtual void decode(pb_istream_t *stream) override { - // For string and bytes, the length value has already been - // parsed, and is available at stream->bytes_left. 
- std::vector res(stream->bytes_left); - if (!pb_read(stream, res.data(), stream->bytes_left)) { - throw std::runtime_error("Decoding failed"); - } - value.assign(res.begin(), res.end()); - } - std::string value; -}; - -template<> -struct Reader : ReaderBase { - Reader(pb_callback_t& cb) : ReaderBase(cb) {} - virtual void decode(pb_istream_t *stream) override { - if (!pb_decode_fixed32(stream, &value)) { - throw std::runtime_error("Decoding failed"); - } - } - double value; -}; - -template<> -struct Reader : ReaderBase { - Reader(pb_callback_t& cb) : ReaderBase(cb) {} - virtual void decode(pb_istream_t *stream) override { - if (!pb_decode_varint(stream, reinterpret_cast(&value))) { - throw std::runtime_error("Decoding failed"); - } - } - int64_t value; -}; - -template<> -struct Reader> : ReaderBase { - Reader(pb_callback_t& cb) : ReaderBase(cb) {} - virtual void decode(pb_istream_t *stream) override { - // For string and bytes, the length value has already been - // parsed, and is available at stream->bytes_left. - value.resize(stream->bytes_left); - if (!pb_read(stream, value.data(), stream->bytes_left)) { - throw std::runtime_error("Decoding failed"); - } - } - std::vector value; -}; - -template<> -struct Reader : ReaderBase { - Reader() - : proto(onnx_TensorProto_init_default) - , dims_reader(proto.dims) - , raw_data_reader(proto.raw_data) - {} - - Reader(pb_callback_t& cb) - : Reader() { initialize_callback(cb); } - - virtual void decode(pb_istream_t *stream) override { - if (!pb_decode(stream, onnx_TensorProto_fields, &proto)) { - throw std::runtime_error("Decoding failed"); - } - - value.dims = std::move(dims_reader.values); - value.raw_data = std::move(raw_data_reader.value); - value.data_type = proto.data_type; - } - - onnx_TensorProto proto; - Reader> dims_reader; - Reader> raw_data_reader; - Tensor_ value; -}; - -template<> -struct Reader : ReaderBase { - Reader() - : proto(onnx_AttributeProto_init_default) - , name_reader(proto.name) - , str_reader(proto.s) - , tensor_reader(proto.t) - , graph_reader(proto.g) - , floats_reader(proto.floats) - , ints_reader(proto.ints) - , strings_reader(proto.strings) - , tensors_reader(proto.tensors) - , graphs_reader(proto.graphs) {} - - Reader(pb_callback_t& cb) - : Reader() { initialize_callback(cb); } - - virtual void decode(pb_istream_t *stream) override { - if (!pb_decode(stream, onnx_AttributeProto_fields, &proto)) { - throw std::runtime_error("Decoding failed"); - } - - value.name = std::move(name_reader.value); - value.type = proto.type; - value.f = proto.f; - value.i = proto.i; - value.s = std::move(str_reader.value); - value.t = std::move(tensor_reader.value); - value.g = std::move(graph_reader.value); - value.fs = std::move(floats_reader.values); - value.is = std::move(ints_reader.values); - value.ss = std::move(strings_reader.values); - value.ts = std::move(tensors_reader.values); - value.gs = std::move(graphs_reader.values); - } - - onnx_AttributeProto proto; - Reader name_reader; - Reader str_reader; - Reader tensor_reader; - Reader graph_reader; - Reader> floats_reader; - Reader> ints_reader; - Reader> strings_reader; - Reader> tensors_reader; - Reader> graphs_reader; - AttributeValue_ value; -}; - -template<> -struct Reader : ReaderBase { - Reader() - : proto(onnx_ValueInfoProto_init_default) - , name_reader(proto.name) {} - Reader(pb_callback_t& cb) - : Reader() { initialize_callback(cb); } - - virtual void decode(pb_istream_t *stream) override { - if (!pb_decode(stream, onnx_ValueInfoProto_fields, &proto)) { - throw 
std::runtime_error("Decoding failed"); - } - - value.name = std::move(name_reader.value); - } - - onnx_ValueInfoProto proto; - Reader name_reader; - Value_ value; -}; - - -template<> -struct Reader : ReaderBase { - Reader() - : proto(onnx_NodeProto_init_default) - , op_type_reader(proto.op_type) - , domain_reader(proto.domain) - , inputs_reader(proto.input) - , outputs_reader(proto.output) - , attrs_reader(proto.attribute) - {} - Reader(pb_callback_t& cb) - : Reader() { initialize_callback(cb); } - - virtual void decode(pb_istream_t *stream) override { - if (!pb_decode(stream, onnx_NodeProto_fields, &proto)) { - throw std::runtime_error("Decoding failed"); - } - - value.op_type = std::move(op_type_reader.value); - value.domain = std::move(domain_reader.value); - value.inputs = std::move(inputs_reader.values); - value.outputs = std::move(outputs_reader.values); - value.attrs = std::move(attrs_reader.values); - } - - onnx_NodeProto proto; - Reader op_type_reader; - Reader domain_reader; - Reader> inputs_reader; - Reader> outputs_reader; - Reader> attrs_reader; - Node_ value; -}; - - -template<> -struct Reader : ReaderBase { - Reader() - : proto(onnx_GraphProto_init_default) - , input_reader(proto.input) - , output_reader(proto.output) - , node_reader(proto.node) - , initializer_reader(proto.initializer) - {} - Reader(pb_callback_t& cb) - : Reader() { initialize_callback(cb); } - - virtual void decode(pb_istream_t *stream) override { - if (!pb_decode(stream, onnx_GraphProto_fields, &proto)) { - throw std::runtime_error("Decoding failed"); - } - - value.inputs = std::move(input_reader.values); - value.outputs = std::move(output_reader.values); - value.nodes = std::move(node_reader.values); - value.initializers = std::move(initializer_reader.values); - } - - static Graph_ read(pb_istream_t *stream) { - Reader reader; - reader.decode(stream); - return reader.value; - } - - onnx_GraphProto proto; - Reader> input_reader; - Reader> output_reader; - Reader> node_reader; - Reader> initializer_reader; - Graph_ value; -}; - - -template<> -struct Reader : ReaderBase { - Reader() - : proto(onnx_ModelProto_init_default) - , graph_reader(proto.graph) {} - Reader(pb_callback_t& cb) - : Reader() { initialize_callback(cb); } - - virtual void decode(pb_istream_t *stream) override { - if (!pb_decode(stream, onnx_ModelProto_fields, &proto)) { - throw std::runtime_error("Decoding failed"); - } - - value.graph = std::move(graph_reader.value); - } - - static Model_ read(pb_istream_t *stream) { - Reader reader; - reader.decode(stream); - return reader.value; - } - - onnx_ModelProto proto; - Reader graph_reader; - Model_ value; -}; - - // IR graph construction -at::Tensor buildTensor(const Tensor_& tensor_) { +namespace onnx = ::ONNX_NAMESPACE; + +at::Tensor buildTensor(const onnx::TensorProto& tensor_proto) { at::Tensor tensor; - switch(tensor_.data_type) { - case onnx_TensorProto_DataType_UINT8: + switch(tensor_proto.data_type()) { + case onnx::TensorProto_DataType_UINT8: tensor = at::CPU(at::kByte).tensor(); break; - case onnx_TensorProto_DataType_INT8: + case onnx::TensorProto_DataType_INT8: tensor = at::CPU(at::kChar).tensor(); break; - case onnx_TensorProto_DataType_INT16: + case onnx::TensorProto_DataType_INT16: tensor = at::CPU(at::kShort).tensor(); break; - case onnx_TensorProto_DataType_INT32: + case onnx::TensorProto_DataType_INT32: tensor = at::CPU(at::kInt).tensor(); break; - case onnx_TensorProto_DataType_INT64: + case onnx::TensorProto_DataType_INT64: tensor = at::CPU(at::kLong).tensor(); break; - case 
onnx_TensorProto_DataType_FLOAT16: + case onnx::TensorProto_DataType_FLOAT16: tensor = at::CPU(at::kHalf).tensor(); break; - case onnx_TensorProto_DataType_FLOAT: + case onnx::TensorProto_DataType_FLOAT: tensor = at::CPU(at::kFloat).tensor(); break; - case onnx_TensorProto_DataType_DOUBLE: + case onnx::TensorProto_DataType_DOUBLE: tensor = at::CPU(at::kDouble).tensor(); break; default: throw std::runtime_error("Unsupported data type"); } - tensor.resize_(tensor_.dims); + std::vector sizes = {tensor_proto.dims().begin(), tensor_proto.dims().end()}; + tensor.resize_(sizes); JIT_ASSERT( tensor.storage()->pImpl()->get_size() * tensor.storage()->pImpl()->elementSize() == - tensor_.raw_data.size()); + tensor_proto.raw_data().size()); - std::memcpy(tensor.data_ptr(), tensor_.raw_data.data(), tensor_.raw_data.size()); + std::memcpy(tensor.data_ptr(), tensor_proto.raw_data().data(), tensor_proto.raw_data().size()); return tensor; } -Graph_ readSubgraph(const std::string& serialized_subgraph) { - pb_istream_t istream = pb_istream_from_buffer(reinterpret_cast(serialized_subgraph.data()), serialized_subgraph.size()); - - return Reader::read(&istream); -} - -void buildBlock(const Graph_& graph_, Block* block, +void buildBlock(const onnx::GraphProto& graph_proto, Block* block, std::unordered_map& value_map); -void buildBlocks(const std::vector& graphs_, Node* node, +void buildBlocks(const std::vector& graphs_, Node* node, std::unordered_map& value_map) { for (auto g_ : graphs_) { auto block = node->addBlock(); @@ -418,97 +77,96 @@ void buildBlocks(const std::vector& graphs_, Node* node, } } -std::shared_ptr buildGraph(const Graph_& graph_) { +std::shared_ptr buildGraph(const onnx::GraphProto& graph_proto) { auto graph = std::make_shared(); std::unordered_map value_map; - buildBlock(graph_, graph->block(), value_map); + buildBlock(graph_proto, graph->block(), value_map); return graph; } -void buildBlock(const Graph_& graph_, Block* block, +void buildBlock(const onnx::GraphProto& graph_proto, Block* block, std::unordered_map& value_map) { - for (auto & input : graph_.inputs) { - value_map[input.name] = block->addInput(); + for (auto & input : graph_proto.input()) { + value_map[input.name()] = block->addInput(); } - for (auto & node_ : graph_.nodes) { - JIT_ASSERT(node_.op_type != "PythonOp"); + for (auto & node_ : graph_proto.node()) { + JIT_ASSERT(node_.op_type() != "PythonOp"); - auto node = block->owningGraph()->create(Symbol::fromDomainAndUnqualString(node_.domain, node_.op_type), - node_.outputs.size()); + auto node = block->owningGraph()->create(Symbol::fromDomainAndUnqualString(node_.domain(), node_.op_type()), + node_.output().size()); - for (auto & attr : node_.attrs) { - Symbol name = Symbol::attr(attr.name); + for (auto & attr : node_.attribute()) { + Symbol name = Symbol::attr(attr.name()); - switch(attr.type) { - case onnx_AttributeProto_AttributeType_UNDEFINED: + switch(attr.type()) { + case onnx::AttributeProto_AttributeType_UNDEFINED: throw std::runtime_error("UNDEFINED attribute unsupported"); break; - case onnx_AttributeProto_AttributeType_FLOAT: - node->f_(name, attr.f); + case onnx::AttributeProto_AttributeType_FLOAT: + node->f_(name, attr.f()); break; - case onnx_AttributeProto_AttributeType_INT: - node->i_(name, attr.i); + case onnx::AttributeProto_AttributeType_INT: + node->i_(name, attr.i()); break; - case onnx_AttributeProto_AttributeType_STRING: - node->s_(name, std::move(attr.s)); + case onnx::AttributeProto_AttributeType_STRING: + node->s_(name, std::move(attr.s())); break; - 
case onnx_AttributeProto_AttributeType_TENSOR: - node->t_(name, buildTensor(attr.t)); + case onnx::AttributeProto_AttributeType_TENSOR: + node->t_(name, buildTensor(attr.t())); break; - case onnx_AttributeProto_AttributeType_GRAPH: - node->g_(name, buildGraph(readSubgraph(attr.g))); + case onnx::AttributeProto_AttributeType_GRAPH: + node->g_(name, buildGraph(attr.g())); break; - case onnx_AttributeProto_AttributeType_FLOATS: - node->fs_(name, std::move(attr.fs)); + case onnx::AttributeProto_AttributeType_FLOATS: + node->fs_(name, {attr.floats().begin(), attr.floats().end()}); break; - case onnx_AttributeProto_AttributeType_INTS: - node->is_(name, std::move(attr.is)); + case onnx::AttributeProto_AttributeType_INTS: + node->is_(name, {attr.ints().begin(), attr.ints().end()}); break; - case onnx_AttributeProto_AttributeType_STRINGS: - node->ss_(name, std::move(attr.ss)); + case onnx::AttributeProto_AttributeType_STRINGS: + node->ss_(name, {attr.strings().begin(), attr.strings().end()}); break; - case onnx_AttributeProto_AttributeType_TENSORS: - node->ts_(name, fmap(attr.ts, [](const Tensor_& t) { return buildTensor(t); })); + case onnx::AttributeProto_AttributeType_TENSORS: + node->ts_(name, fmap(attr.tensors(), [](const onnx::TensorProto& t) { return buildTensor(t); })); break; - case onnx_AttributeProto_AttributeType_GRAPHS: - if (attr.name == "_blocks") { - buildBlocks(fmap(attr.gs, [](const std::string& g) { return readSubgraph(g); }), node, value_map); + case onnx::AttributeProto_AttributeType_GRAPHS: + if (attr.name() == "_blocks") { + buildBlocks({attr.graphs().begin(), attr.graphs().end()}, node, value_map); } else { - node->gs_(name, fmap(fmap(attr.gs, [](const std::string& g) { return readSubgraph(g); } ), - [](const Graph_& g_) { return buildGraph(g_); })); + node->gs_(name, fmap(attr.graphs(), [](const onnx::GraphProto& g_) { return buildGraph(g_); })); } break; } } - for (auto & input : node_.inputs) { + for (auto & input : node_.input()) { auto v = value_map[input]; node->addInput(v); } - for (size_t i=0; ioutputs()[i]; + for (int i=0; ioutputs()[i]; } block->appendNode(node); } - for (auto & output : graph_.outputs) { - Value* v = value_map.at(output.name); + for (auto & output : graph_proto.output()) { + Value* v = value_map.at(output.name()); block->registerOutput(v); } } -std::shared_ptr buildGraph(const Graph_& graph_, std::vector& initializers) { +std::shared_ptr buildGraph(const onnx::GraphProto& graph_proto, std::vector& initializers) { - auto graph = buildGraph(graph_); + auto graph = buildGraph(graph_proto); - for (auto tensor_ : graph_.initializers) { + for (auto tensor_ : graph_proto.initializer()) { initializers.push_back(buildTensor(tensor_)); } @@ -557,12 +215,10 @@ void reconstructOutputTypes(Block *b) { std::shared_ptr ImportIRGraph(const std::string& serialized_graph, std::vector& initializers) { + auto model_proto = ::ONNX_NAMESPACE::ModelProto(); + model_proto.ParseFromString(serialized_graph); - pb_istream_t istream = pb_istream_from_buffer(reinterpret_cast(serialized_graph.data()), serialized_graph.size()); - - auto model = Reader::read(&istream); - - auto graph = buildGraph(model.graph, initializers); + auto graph = buildGraph(model_proto.graph(), initializers); reconstructOutputTypes(graph->block()); diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index d3a9bd9139a96e..5363eda02ff528 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -18,6 +18,7 @@ #include "torch/csrc/jit/passes/onnx/fixup_onnx_loop.h" #include 
"torch/csrc/jit/passes/shape_analysis.h" #include "torch/csrc/jit/passes/decompose_addmm.h" +#include "torch/csrc/jit/passes/constant_propagation.h" #include "torch/csrc/jit/passes/loop_unrolling.h" #include "torch/csrc/jit/passes/to_batch.h" #include "torch/csrc/jit/passes/specialize_undef.h" @@ -70,11 +71,14 @@ void initJITBindings(PyObject *module) { }) .def("_jit_pass_lint", LintGraph) .def("_jit_pass_shape_analysis", [](Graph& graph, py::tuple inputs, bool with_grad) { - PropagateInputShapes(graph, ArgumentSpec(with_grad, createStack(inputs))); + PropagateInputShapes(graph, ArgumentSpec(with_grad, createStack(inputs, graph.inputs()))); }) .def("_jit_pass_remove_expands", RemoveExpands) .def("_jit_pass_erase_number_types", EraseNumberTypes) .def("_jit_pass_loop_unrolling", UnrollLoops) + .def("_jit_pass_constant_propagation", [](std::shared_ptr& g) { + return ConstantPropagation(g); + }) .def("_jit_run_cpp_tests", [] { // We have to release the GIL inside this method, because if we happen to // initialize the autograd engine in these tests, the newly spawned worker threads will @@ -182,15 +186,16 @@ void initJITBindings(PyObject *module) { return ge.graph(); }) .def("graph_for", [](GraphExecutor& ge, py::args args) { - return ge.graphFor(createStack(args)); + return ge.graphFor(createStack(args, ge.graph()->inputs())); }) .def("get_debug_state", [](GraphExecutor& ge) { return ge.getDebugState(); }) .def("__call__", [](GraphExecutor& ge, py::args args) -> py::object { - auto stack = createStack(args); + const auto & graph = ge.graph(); + auto stack = createStack(args, graph->inputs()); ge.run(stack); - return wrapStack(std::move(stack)); + return wrapStack(std::move(stack), graph->outputs()); }); diff --git a/torch/csrc/jit/interned_strings.h b/torch/csrc/jit/interned_strings.h index 52b8cb0eaccd98..c567793552d73a 100644 --- a/torch/csrc/jit/interned_strings.h +++ b/torch/csrc/jit/interned_strings.h @@ -50,6 +50,7 @@ _(prim, TensorToNum) \ _(prim, AutogradAdd) \ _(prim, GradOf) \ _(prim, AnyDefined) \ +_(prim, FusedConcat) \ _(aten, __not__) \ FORALL_ATEN_BASE_SYMBOLS(_) \ _(onnx, Add) \ diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index 65bdcf695f6de2..0c1fe17ade0dfd 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -337,9 +337,9 @@ struct PreprocessGraph { struct ContainerTensor : public at::TensorImpl { public: ContainerTensor() - : TensorImpl(&(at::globalContext().getType(at::Backend::Undefined,at::ScalarType::Undefined)), nullptr) {} + : TensorImpl(at::Backend::Undefined,at::ScalarType::Undefined, nullptr, /* is_variable */ false) {} - virtual ~ContainerTensor() {} + virtual ~ContainerTensor() = default; virtual at::IntList sizes() const override { throw std::runtime_error("sizes() on ContainerTensor"); } @@ -685,8 +685,8 @@ struct CodeImpl { // InterpreterState state that is held across stages and used to compute a Code struct InterpreterStateImpl { - InterpreterStateImpl(const Code & function_) - : function(function_.pImpl), + InterpreterStateImpl(const Code & code) + : function(code.pImpl), int_data(function->int_data.data()), bool_data(function->bool_data), registers(function->register_size) { @@ -775,15 +775,15 @@ std::ostream & operator<<(std::ostream & out, const Code & code) { Code::Code(std::shared_ptr& graph) : pImpl(new CodeImpl(graph)) {} -Code::~Code() {} +Code::~Code() = default; const std::vector& Code::executors() { return pImpl->executors(); } -InterpreterState::InterpreterState(const Code & 
function) - : pImpl(new InterpreterStateImpl(function)) {} -InterpreterState::~InterpreterState() {} +InterpreterState::InterpreterState(const Code & code) + : pImpl(new InterpreterStateImpl(code)) {} +InterpreterState::~InterpreterState() = default; void InterpreterState::runOneStage(Stack & stack) { return pImpl->runOneStage(stack); diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index 7f09b22b324d11..ede14249c46dce 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -44,9 +44,9 @@ std::ostream& operator<<(std::ostream & out, const at::ArrayRef & nodes) { } struct const_value_list_with_types { - const std::vector& values; + const ArrayRef values; bool use_newlines; - const_value_list_with_types(const std::vector& values, bool use_newlines = false) + const_value_list_with_types(ArrayRef values, bool use_newlines = false) : values(values), use_newlines(use_newlines) {} }; std::ostream& operator<<(std::ostream & out, const_value_list_with_types l) { @@ -355,7 +355,7 @@ void Graph::lint() const { // - every use will occur later in the topsort struct LintScope { - LintScope() {} + LintScope() = default; LintScope(std::unique_ptr parent) : parent(std::move(parent)) {} bool contains(const Value * v) { @@ -487,13 +487,13 @@ void LintGraph(std::shared_ptr& graph) { graph->lint(); } -void Block::cloneFrom(Block * src, std::function outer_map) { +void Block::cloneFrom(Block * src, std::function value_map) { std::unordered_map local_map; auto env = [&](Value * v) { auto it = local_map.find(v); if(it != local_map.end()) return it->second; - return outer_map(v); + return value_map(v); }; auto graph = owningGraph(); @@ -619,23 +619,8 @@ Value* Node::namedInput(Symbol name) const { // so this is completely unsafe and needs to be gone as soon as possible. 
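The namedInput simplification in this hunk (continued just below) works because Tensor[] arguments are no longer flattened into a node's input list; the list arrives as a single Value produced by a prim::ListConstruct node. An illustrative helper showing how a consumer can now recover the element tensors:

    // Hypothetical helper: collect the tensors feeding an aten::cat node.
    std::vector<Value*> catTensorInputs(Node* cat) {
      JIT_ASSERT(cat->kind() == aten::cat);
      Value* tensors = cat->namedInput(attr::tensors);  // the single Tensor[] value
      Node* list = tensors->node();
      JIT_ASSERT(list->kind() == prim::ListConstruct);
      return list->inputs().vec();                      // the individual tensor Values
    }
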
return v; } - const auto & the_schema = schema(); - int64_t tensor_list_pos = 0; - for (auto & arg : the_schema.arguments) { - if (*arg.type == *ListType::ofTensors()) - break; - tensor_list_pos++; - } int64_t arg_pos = findArgument(schema(), name).first; - // XXX: we don't have a single value we could give for a Tensor[], - // because we flatten lists into arguments - JIT_ASSERT(arg_pos != tensor_list_pos); - // NB: if there's no tensor list, then tensor_list_pos == arguments.size(), so this is always true - if (arg_pos < tensor_list_pos) { - return input(arg_pos); - } else { - return input(inputs().size() - (the_schema.arguments.size() - arg_pos)); - } + return input(arg_pos); } bool Node::matches(const char *signature_literal, at::ArrayRef const_inputs) { @@ -646,8 +631,12 @@ bool Node::matches(const char *signature_literal, at::ArrayRef const_inp return true; } +void Node::dump() const { + std::cout << *this << "\n"; +} + void Node::findSchema() const { - schema_ = &getOperatorFor(this).schema; + schema_ = &getOperatorFor(this).schema(); } PythonOp* defaultAllocPythonOp(Graph*g) { diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index 9af468e6ee06e7..b2caa642b6fe20 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -54,7 +54,7 @@ struct Value; TORCH_API std::ostream& operator<<(std::ostream & out, const Graph & g); TORCH_API std::ostream& operator<<(std::ostream & out, const Type & t); -TORCH_API std::ostream& operator<<(std::ostream & out, const Node & t); +TORCH_API std::ostream& operator<<(std::ostream & out, const Node & n); // A list of nodes, with inputs and outputs struct Block; @@ -683,7 +683,9 @@ struct Node : public Attributes { return *schema_; } - virtual ~Node() {} + void dump() const; + + virtual ~Node() = default; private: std::pair findInput(Symbol name); void findSchema() const; @@ -889,8 +891,7 @@ friend struct Block; , block_(new Block(this, nullptr)) , insert_before_(return_node()) {} - Graph() - : Graph( std::make_shared()) {} + Graph() : Graph(std::make_shared()) {} at::ArrayRef inputs() { return block_->inputs(); diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index 42a5be89e55e4b..6eef40a0323068 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -83,6 +83,7 @@ struct ConstantList; struct IValue; using Tuple = ConstantList; using IntList = ConstantList; +using TensorList = ConstantList; using DoubleList = ConstantList; // IValue is the generic tagged union used by the interpreter to hold @@ -93,7 +94,7 @@ using DoubleList = ConstantList; // retain/release calls. 
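The TensorList variant added to IValue in the hunk below mirrors the existing IntList/DoubleList handling. A rough sketch of how it might be used, assuming the constructors and accessors introduced there (the helper names are hypothetical, and the list is assumed non-empty):

    IValue packTensors(std::vector<at::Tensor> tensors) {
      return IValue(std::move(tensors));                // tagged as TensorList
    }

    at::Tensor firstTensor(const IValue& value) {
      JIT_ASSERT(value.isTensorList());
      return value.toTensorList()->elements().front();  // Shared<TensorList> access
    }
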
#define TORCH_FORALL_TAGS(_) \ - _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) + _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(TensorList) struct IValue { IValue() @@ -223,6 +224,20 @@ struct IValue { return toRetainable(); } + //TensorList + IValue(Shared v); + IValue(std::vector v); + bool isTensorList() const { return Tag::TensorList == tag; } + Shared toTensorList() && { + JIT_ASSERT(isTensorList()); + return moveToRetainable(); + } + Shared toTensorList() const & { + JIT_ASSERT(isTensorList()); + return toRetainable(); + } + + // None bool isNone() { return Tag::None == tag; } @@ -369,8 +384,15 @@ inline IValue::IValue(Shared v) inline IValue::IValue(std::vector v) : IValue(DoubleList::create(std::move(v))) {} +inline IValue::IValue(Shared v) +: tag(Tag::TensorList), retainable(true) { + as_retainable = v.detach(); +} +inline IValue::IValue(std::vector v) +: IValue(TensorList::create(std::move(v))) {} + inline std::vector IValue::copyToIntList() const { - return std::vector(toIntList()->elements()); + return toIntList()->elements().vec(); } }} diff --git a/torch/csrc/jit/operator.cpp b/torch/csrc/jit/operator.cpp index f19d18caa9289e..5cb2c2c11ad5a7 100644 --- a/torch/csrc/jit/operator.cpp +++ b/torch/csrc/jit/operator.cpp @@ -248,8 +248,12 @@ std::string canonicalSchemaString(const FunctionSchema& schema) { using OperatorMap = std::unordered_map>>; struct OperatorRegistry { - OperatorMap operators; +private: std::mutex lock; + OperatorMap operators; + // list of operators whose schema have not yet been parsed, and must + // be registered before any call to lookup an opeator + std::vector> to_register; // Those two maps are used to implement lookupByLiteral, which is needed for the n->match(...) calls. // Basically, every function schema is assigned a unique string you can use to match it. However, // parsing those strings or comparing and hashing them character by character would be very slow, so @@ -260,18 +264,26 @@ struct OperatorRegistry { // by performing a lookup in the operators_by_sig map. 
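The registry rework in this hunk defers schema parsing: registerOperator only queues the Operator, and registerPendingOperators indexes the queue under the lock before any lookup, so string schemas are parsed at most once and never during static initialization. The same pattern in isolation (the type and member names are illustrative, not the real registry):

    struct LazySignatureIndex {
      std::mutex lock;
      std::vector<std::shared_ptr<Operator>> pending;
      std::unordered_map<std::string, std::shared_ptr<Operator>> by_signature;

      void add(Operator&& op) {
        std::lock_guard<std::mutex> guard(lock);
        pending.push_back(std::make_shared<Operator>(std::move(op)));  // no parsing yet
      }
      std::shared_ptr<Operator> find(const std::string& signature) {
        std::lock_guard<std::mutex> guard(lock);
        for (auto& op : pending)                // schema() parses lazily here
          by_signature[canonicalSchemaString(op->schema())] = op;
        pending.clear();
        auto it = by_signature.find(signature);
        return it == by_signature.end() ? nullptr : it->second;
      }
    };
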
std::unordered_map> operators_by_sig; std::unordered_map> operators_by_sig_literal; - void registerOperator(Operator&& op){ - std::lock_guard guard(lock); - Symbol sym = Symbol::fromQualString(op.schema.name); - auto op_ptr = std::make_shared(std::move(op)); - - operators[sym].push_back(op_ptr); + // XXX - caller must be holding lock + void registerPendingOperators() { + for(auto op : to_register) { + Symbol sym = Symbol::fromQualString(op->schema().name); + operators[sym].push_back(op); + operators_by_sig[canonicalSchemaString(op->schema())] = op; + } + to_register.clear(); + } - operators_by_sig[canonicalSchemaString(op.schema)] = op_ptr; +public: + void registerOperator(Operator&& op) { + std::lock_guard guard(lock); + to_register.push_back(std::make_shared(std::move(op))); } const std::shared_ptr& lookupByLiteral(const char * name) { + std::lock_guard guard(lock); + registerPendingOperators(); auto it = operators_by_sig_literal.find(name); if (it == operators_by_sig_literal.end()) { auto op_ptr_it = operators_by_sig.find(name); @@ -289,8 +301,10 @@ struct OperatorRegistry { return it->second; } + const std::vector>& getOperators(Symbol name) { std::lock_guard guard(lock); + registerPendingOperators(); static std::vector> empty; auto it = operators.find(name); if(it != operators.end()) @@ -342,16 +356,16 @@ bool typeMatches(TypePtr actual, TypePtr formal) { } bool Operator::matches(const Node* node) const { - if (node->kind().toQualString() != schema.name) { + if (node->kind().toQualString() != schema().name) { return false; } size_t attributes_size = node->numAttributes(); size_t attributes_seen = 0; auto inputs_size = node->inputs().size(); size_t input_i = 0; - for(size_t arg_i = 0; arg_i < schema.arguments.size(); ++arg_i) { + for(size_t arg_i = 0; arg_i < schema().arguments.size(); ++arg_i) { at::optional attribute_kind; - const Argument& arg = schema.arguments[arg_i]; + const Argument& arg = schema().arguments[arg_i]; if(attributes_size > 0 && (attribute_kind = attributeKindOf(arg.type))) { auto name = Symbol::fromQualString("attr::" + arg.name); if(!node->hasAttribute(name) || node->kindOf(name) != *attribute_kind) { @@ -359,22 +373,6 @@ bool Operator::matches(const Node* node) const { return false; } attributes_seen++; - } else if(*arg.type == *ListType::ofTensors()) { - // Tensor[] is handled as varargs, consume inputs until the remaining required arguments - // XXX - there can only be a single Tensor[] in a declaration - size_t remaining_required = 0; - for(size_t j = arg_i + 1; j < schema.arguments.size(); ++j){ - // remaining arguments are only those that won't be consumed from attributes - if(attributes_size == 0 || !attributeKindOf(schema.arguments[j].type)) - remaining_required++; - } - while(inputs_size - input_i > remaining_required) { - auto input = node->inputs()[input_i++]; - if(!typeMatches(input->type(), DynamicType::get())) { - // std::cout << "vararg argument is not Dynamic\n"; - return false; - } - } } else { if(input_i == inputs_size) { // std::cout << "not enough inputs\n"; @@ -388,11 +386,11 @@ bool Operator::matches(const Node* node) const { } } - if(!schema.is_vararg && input_i != inputs_size) { + if(!schema().is_vararg && input_i != inputs_size) { // std::cout << "not all inputs used\n" << input_i << " " << inputs_size << "\n"; return false; } - if(!schema.is_vararg && attributes_seen != attributes_size) { + if(!schema().is_vararg && attributes_seen != attributes_size) { // std::cout << "not all attributes used\n" << attributes_seen << " " << 
attributes_size << "\n"; return false; } @@ -426,7 +424,7 @@ const Operator& getOperatorFor(const Node* node) { er << "\ncandidates were:\n"; const auto& candidates = getAllOperatorsFor(node->kind()); for(auto & candidate : candidates) { - er << " " << candidate->schema << "\n"; + er << " " << candidate->schema() << "\n"; } throw er; } @@ -436,7 +434,7 @@ OperatorSet::OperatorSet(std::initializer_list sig_literals) { auto & registry = getRegistry(); for (const char * sig : sig_literals) { auto op = registry.lookupByLiteral(sig); - ops[Symbol::fromQualString(op->schema.name)].push_back(op); + ops[Symbol::fromQualString(op->schema().name)].push_back(op); } } diff --git a/torch/csrc/jit/operator.h b/torch/csrc/jit/operator.h index 7e6a314d2cb8c3..be2c20b01a5379 100644 --- a/torch/csrc/jit/operator.h +++ b/torch/csrc/jit/operator.h @@ -2,57 +2,81 @@ // once C10 exists this can be removed, or stubbed out, but we need // it now to implement correct semantic checking for script #pragma once -#include "ATen/ATen.h" + #include "torch/csrc/jit/assertions.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/function_schema.h" #include "torch/csrc/jit/stack.h" +#include "ATen/ATen.h" + +#include +#include +#include +#include +#include +#include +#include + namespace torch { namespace jit { -FunctionSchema parseSchema(const std::string& decl); +FunctionSchema parseSchema(const std::string& schema); using OperationCreator = std::function; struct TORCH_API Operator { - Operator(FunctionSchema schema, OperationCreator op, OperationCreator op_const_attributes = nullptr) - : schema(std::move(schema)) - , op(std::move(op)) - , op_const_attributes(std::move(op_const_attributes)) {} + Operator(FunctionSchema schema, OperationCreator op_creator) + : schema_(std::make_shared(std::move(schema))), + op_creator_(std::move(op_creator)) {} - Operator(const std::string& schema, OperationCreator op, OperationCreator op_const_attributes = nullptr) - : Operator(parseSchema(schema), std::move(op), std::move(op_const_attributes)) {} + Operator(const std::string& schema, OperationCreator op_creator) + : schema_string_(schema), op_creator_(std::move(op_creator)) {} - // Helper constructor to regsiter `op` to run + // Helper constructor to register `op` to run // run for _every_ IR Node where n.kind() == name, regardless of arguments. - // This is accomplished by marking the schema varargs and having no required arguments. - // This is used for things like prim::While or prim::If that can take a number - // of different valid input types and lengths. - Operator(Symbol name, OperationCreator op) - : Operator(FunctionSchema(name, {}, {}, true), op, op) {} - - FunctionSchema schema; - - bool matches(const Node* n) const; - // Operators have different versions depending on if some inputs are encoded - // as attributes or inputs. This function returns the right Operation function, - // given a node encoded for one variant. - // Behavior is undefined if matches(n) == false - // TODO (apaszke) : remove - Operation selectVariant(Node* n) const { - if(n->hasAttributes()) { - JIT_ASSERT(op_const_attributes != nullptr); - return op_const_attributes(n); - } else { - return op(n); + // This is accomplished by marking the schema varargs and having no required + // arguments. This is used for things like prim::While or prim::If that can + // take a number of different valid input types and lengths. 
+ Operator(Symbol name, OperationCreator op_creator) + : Operator(FunctionSchema(name, {}, {}, true), std::move(op_creator)) {} + + Operator(FunctionSchema schema, Operation op) + : schema_(std::make_shared(std::move(schema))), + op_(std::make_shared(std::move(op))) {} + + Operator(const std::string& schema, Operation op) + : schema_string_(schema), + op_(std::make_shared(std::move(op))) {} + + bool matches(const Node* node) const; + + Operation getOperation(Node* node = nullptr) const { + if (op_) { + return *op_; } + AT_ASSERT(node != nullptr); + return op_creator_(node); } - bool hasAttributedVersion() const { - return op_const_attributes != nullptr; + + const FunctionSchema & schema() const { + // we lazily parse schema initialized from strings so that + // we do less work during static operator registration + if(!schema_) { + schema_ = std::make_shared(parseSchema(schema_string_.value())); + schema_string_ = at::nullopt; + } + return *schema_; } private: - OperationCreator op; - OperationCreator op_const_attributes; + mutable at::optional schema_string_; + // cannot use at::optional because windows has issues that require an assignment operator to be generated + // cannot use std::unique_ptr because initializer lists of Operators end up copying the Operator + mutable std::shared_ptr schema_; + + // Essentially a variant. + // NB: std::function has a default state (where it == nullptr). + std::shared_ptr op_; + OperationCreator op_creator_; }; const std::vector>& getAllOperatorsFor(Symbol name); @@ -62,7 +86,7 @@ const Operator& getOperatorFor(const Node* node); inline Operation getOperation(Node* node) { // note: getOperatorFor ensures that getOperatorFor(node).matches(node) == true // so the call to selectVariant is always valid. - return getOperatorFor(node).selectVariant(node); + return getOperatorFor(node).getOperation(node); } void registerOperator(Operator&& op); diff --git a/torch/csrc/jit/passes/batch_mm.cpp b/torch/csrc/jit/passes/batch_mm.cpp index 0e40bc8831a6df..414dc1652a4da1 100644 --- a/torch/csrc/jit/passes/batch_mm.cpp +++ b/torch/csrc/jit/passes/batch_mm.cpp @@ -3,8 +3,9 @@ #include "torch/csrc/jit/passes/dead_code_elimination.h" #include "torch/csrc/jit/interned_strings.h" #include "torch/csrc/jit/constants.h" -#include "torch/csrc/utils/functional.h" +#include "torch/csrc/jit/symbolic_variable.h" #include "torch/csrc/jit/assertions.h" +#include "torch/csrc/utils/functional.h" #include #include @@ -191,12 +192,11 @@ void BatchMMBlock(Block* block) { int cat_dim = s == Side::LHS ? 
1 : 0; cat_sizes[cat_dim] *= matmuls.size(); // make them really cat_sizes - auto inputs = fmap(matmuls, [=](Node *mm) { return mm->inputs()[inputs_off]; }); WithInsertPoint iguard { root.node }; - inputs.push_back(insertConstant(*graph, cat_dim)); - Node *cat = graph->insertNode(graph->create(aten::cat, inputs)); - cat->output()->setType(type->withSizes(cat_sizes)); - return cat->output(); + auto inputs = fmap(matmuls, [=](Node *mm) -> SymbolicVariable { return mm->inputs()[inputs_off]; }); + auto cat_output = SymbolicVariable::cat(inputs, cat_dim).value(); + cat_output->setType(type->withSizes(cat_sizes)); + return cat_output; }; auto lhs_batch = batch_inputs(Side::LHS, root.lhs_sizes); diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp new file mode 100644 index 00000000000000..39492f9e76c50c --- /dev/null +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -0,0 +1,95 @@ +#include "torch/csrc/jit/passes/constant_propagation.h" +#include "torch/csrc/autograd/variable.h" +#include "torch/csrc/jit/constants.h" +#include "torch/csrc/jit/interpreter.h" +#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/ivalue.h" +#include "torch/csrc/jit/operator.h" +#include "torch/csrc/jit/passes/dead_code_elimination.h" +#include "torch/csrc/utils/functional.h" + +namespace torch { namespace jit { + +namespace { + +std::unordered_set skip_list = { + //FIXME If & Loop require special casing because they cannot be run as a + //single node. + prim::If, + prim::Loop, + //FIXME Same problem as in DCE - cpp & python PythonOp and CppOp should be + //FIXME treated as having side effects but ONNX depends on them being removed + prim::Print, + //all the rand functions from native_functions.yaml + aten::permute, + aten::rand, + aten::rand_out, + aten::rand_like, + aten::randint, + aten::randint_out, + aten::randint_like, + aten::randn, + aten::randn_out, + aten::randn_like, + aten::randperm, + aten::randperm_out, + }; + +std::vector runNode(Node* n) { + auto op = getOperation(n); + Stack stack; + for (auto input : n->inputs()) { + stack.push_back(*(toIValue(input))); + } + op(stack); + auto var_outputs = fmap(stack, [&](IValue v) { + if (v.isTensor()) { + return IValue(autograd::as_variable_ref(v.toTensor()).data()); + } else { + return v; + } + }); + return var_outputs; +} + +void propagateNode(Node* n) { + auto outputs = runNode(n); + auto graph = n->owningGraph(); + WithInsertPoint guard(n); + for (size_t i = 0; i < outputs.size(); ++i) { + auto new_output = insertConstant(*graph, outputs[i]); + n->outputs()[i]->replaceAllUsesWith(new_output); + // let dce elimination remove n + } +} + +} // anonymous namespace + +void ConstantPropagation(Node* n, bool recurse) { + bool constant_inputs = (n->inputs().size() > 0) && + std::all_of(n->inputs().begin(), n->inputs().end(), [&](Value* v) { + return v->node()->kind() == prim::Constant; + }); + bool supported_node = skip_list.count(n->kind()) == 0; + if (constant_inputs && supported_node) { + propagateNode(n); + } + if (recurse) { + for (Block * block : n->blocks()) + ConstantPropagation(block, recurse); + } +} + +void ConstantPropagation(Block* block, bool recurse) { + ConstantPropagation(block->param_node(), recurse); + for (auto n: block->nodes()) { + ConstantPropagation(n, recurse); + } +} + +void ConstantPropagation(std::shared_ptr& graph) { + ConstantPropagation(graph->block(), true); + EliminateDeadCode(graph); +} + +}} diff --git a/torch/csrc/jit/passes/constant_propagation.h 
b/torch/csrc/jit/passes/constant_propagation.h new file mode 100644 index 00000000000000..12df329c81ccfc --- /dev/null +++ b/torch/csrc/jit/passes/constant_propagation.h @@ -0,0 +1,11 @@ +#pragma once + +#include "torch/csrc/jit/ir.h" + +namespace torch { namespace jit { + +TORCH_API void ConstantPropagation(std::shared_ptr& graph); +TORCH_API void ConstantPropagation(Block* block, bool recurse); +TORCH_API void ConstantPropagation(Node* n, bool recurse); + +}} diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index cb3757cffb0e34..cc8dcb8926dee0 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -177,16 +177,25 @@ struct GraphFuser { } } - bool allCatInputsHaveSameSize(Node * node) { - JIT_ASSERT(node->kind() == aten::cat); - std::vector inputs = node->inputs(); - if (!node->hasAttributes()) { - inputs.pop_back(); // Get rid of the dim argument - } + bool isFusableCatNode(Node * node) { + if (node->kind() != aten::cat) + return false; + if (!node->is_constant(attr::dim)) + return false; - auto expected = inputs.at(0)->type()->cast(); + auto tensors_node = node->namedInput(attr::tensors)->node(); + if (tensors_node->kind() != prim::ListConstruct) return false; + // NB: Note that technically other uses of the list aren't a big problem for us. + // It would be enough to place the prim::FusedConcat before the prim::ListConstruct, and + // allUsersAreThisConsumerOrOccurAfterIt would still be satisfied. However, I don't expect this + // to be necessary any time soon, and so we're simply assuming that we don't have to deal with that. + if (tensors_node->output()->uses().size() > 1) return false; + auto tensors = tensors_node->inputs(); + + // Our fusion code assumes that all inputs have the same shapes, so we need to check this too. + auto expected = tensors.at(0)->type()->cast(); if (!expected) return false; - return std::all_of(inputs.begin(), inputs.end(), [expected](Value *v) { + return std::all_of(tensors.begin(), tensors.end(), [&expected](Value *v) { auto actual = v->type()->cast(); return actual && actual->sizes() == expected->sizes(); }); @@ -197,15 +206,7 @@ struct GraphFuser { // because it is not a simple map, can be put in a fusion group // as long as no items in the group read the output of concat bool isFusableAsExitNode(Node * node) { - if(isFusable(node)) - return true; - // this concat fusion only works when all the inputs are the same size - // and we can statically infer the dimension along which we should concat - // otherwise they cannot partipate in the same map - if(node->kind() == aten::cat && node->is_constant(attr::dim) && allCatInputsHaveSameSize(node)) - return true; - - return false; + return isFusable(node) || isFusableCatNode(node); } // necessary condition for fusion. If all of the uses of producer are consumer @@ -241,8 +242,9 @@ struct GraphFuser { // we can move the consumer up into the producer. // but this requires better handling of merging fusion groups so it is not done now at::optional consumer_device = getDevice(consumer); + Node *real_consumer = consumer->kind() == aten::cat ? 
consumer->namedInput(attr::tensors)->node() : consumer; return isFusable(producer->node()) && - allUsersAreThisConsumerOrOccurAfterIt(consumer, producer) && + allUsersAreThisConsumerOrOccurAfterIt(real_consumer, producer) && consumer_device && consumer_device == getDevice(producer->node()) && (*consumer_device != kCPUDevice || sharedFusionCompiler().canCompileOnCPU()); } @@ -389,7 +391,24 @@ struct GraphFuser { Node * fuse(Node * consumer, Value * producer) { auto group = consumer; - if(group->kind() != prim::FusionGroup) { + if (consumer->kind() == aten::cat) { + Graph * graph = consumer->owningGraph(); + Node * list_construct = consumer->namedInput(attr::tensors)->node(); + int64_t dim = consumer->get(attr::dim).value(); + + Node * fused_cat = graph->create(prim::FusedConcat, list_construct->inputs())->i_(attr::dim, dim); + fused_cat->insertBefore(list_construct); + fused_cat->output()->copyMetadata(consumer->output()); + consumer->output()->replaceAllUsesWith(fused_cat->output()); + topological_index[fused_cat] = topological_index[list_construct]; + + // NB: this deletes the fused_cat node from the original graph + group = createSingletonFusionGroup(fused_cat); + consumer->destroy(); + if (list_construct->output()->uses().empty()) { + list_construct->destroy(); + } + } else if (consumer->kind() != prim::FusionGroup) { group = createSingletonFusionGroup(consumer); } if (producer->node()->kind() == prim::FusionGroup) { @@ -450,7 +469,6 @@ struct GraphFuser { } } - // TODO: Remove this restriction if we ever need to distribute across // multiple return operators Node * producer_for_chunk_node = producer_for_chunk->node(); JIT_ASSERT(producer_for_chunk_node->outputs().size() == 1); @@ -521,11 +539,14 @@ struct GraphFuser { std::pair scanNode(Node * consumer) { auto stage_guard = block->owningGraph()->setStageTemporary(consumer->stage()); if(isFusableAsExitNode(consumer)) { + value_list inputs; + auto consumer_inputs = consumer->kind() == aten::cat ? + consumer->namedInput(attr::tensors)->node()->inputs() : + consumer->inputs(); // handle inputs in reverse topological order as well... // otherwise in f(a,a+b) it will appear a is used twice if we consider // the f-a fusion before the f-(a+b) fusion first. 
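For illustration, here is a minimal standalone sketch of the look-through-ListConstruct pattern that isFusableCatNode, canFuse, and the consumer_inputs selection above all rely on: when the consumer is aten::cat, its tensor inputs come wrapped in a prim::ListConstruct node, so the fuser inspects that node's inputs instead of the consumer's own. ToyNode and realInputs are hypothetical stand-ins, not the real JIT IR classes.

#include <cassert>
#include <string>
#include <vector>

// Toy stand-ins for the JIT IR; names are illustrative only.
struct ToyNode {
  std::string kind;
  std::vector<ToyNode*> inputs;   // producer nodes of each input value
};

// For most consumers the fusable producers are the direct inputs, but for
// aten::cat the tensors are wrapped in a prim::ListConstruct node, so we
// look through that node to reach the real producers.
std::vector<ToyNode*> realInputs(ToyNode* consumer) {
  if (consumer->kind == "aten::cat") {
    ToyNode* list = consumer->inputs.at(0);      // the tensor-list argument
    assert(list->kind == "prim::ListConstruct");
    return list->inputs;
  }
  return consumer->inputs;
}

int main() {
  ToyNode a{"aten::mul", {}}, b{"aten::add", {}};
  ToyNode list{"prim::ListConstruct", {&a, &b}};
  ToyNode cat{"aten::cat", {&list}};
  assert(realInputs(&cat).size() == 2);   // sees a and b, not the list node
  assert(realInputs(&a).empty());
}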
- value_list inputs; - for(auto i : consumer->inputs()) { + for(auto i : consumer_inputs) { if (i->node()->owningBlock() == block) { inputs.push_back(i); JIT_ASSERT(topological_index.count(i->node()) > 0); diff --git a/torch/csrc/jit/passes/lower_grad_of.h b/torch/csrc/jit/passes/lower_grad_of.h index a0a881e3002ed9..0ec3589e3acd31 100644 --- a/torch/csrc/jit/passes/lower_grad_of.h +++ b/torch/csrc/jit/passes/lower_grad_of.h @@ -10,6 +10,6 @@ namespace torch { namespace jit { // outputs = // else: // outputs = undefineds -TORCH_API void LowerGradOf(Graph& graph); +TORCH_API void LowerGradOf(Graph& g); }} diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index 63fb7030aa3ad1..ee9b76f417bd17 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -263,6 +263,39 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { default: break; // fall-through } + if (node->matches("aten::cat(Tensor[] tensors, int dim) -> Tensor", /*with_const=*/attr::dim)) { + auto list_node = node->namedInput(attr::tensors)->node(); + JIT_ASSERT(list_node->kind() == prim::ListConstruct); + auto tensors = list_node->inputs(); + if (tensors.size() > 0) { + auto input_types = fmap(tensors, [](Value *v) { return v->type()->cast(); }); + if (std::all_of(input_types.begin(), input_types.end(), + [](const TensorTypePtr& tp) { return tp != nullptr; })) { + std::vector sizes = input_types[0]->sizes(); + const int64_t dim = wrapDim(node->get(attr::dim).value(), sizes); + const int64_t ndim = sizes.size(); + + if (dim < 0 || dim >= ndim) + goto cat_fail; + + sizes[dim] = 0; + for (auto & tp : input_types) { + auto & tp_sizes = tp->sizes(); + if (sizes.size() != tp_sizes.size()) + goto cat_fail; + for (int64_t i = 0; i < ndim; ++i) { + if (sizes[i] != tp_sizes[i] && i != dim) { + goto cat_fail; + } + } + sizes[dim] += tp_sizes[dim]; + } + node->output()->setType(input_types[0]->withSizes(sizes)); + return; + } + } + } +cat_fail: bool can_propagate_by_running = canPropagateShapeByRunningIt(node); auto maybe_tensor_types = gatherTensorTypes(node); diff --git a/torch/csrc/jit/passes/to_batch.cpp b/torch/csrc/jit/passes/to_batch.cpp index 5494cf2b78a798..f78da9b92baccc 100644 --- a/torch/csrc/jit/passes/to_batch.cpp +++ b/torch/csrc/jit/passes/to_batch.cpp @@ -3,59 +3,530 @@ namespace torch { namespace jit { -std::unordered_map> ToBatch::batch_operator_table; +std::unordered_map>> ToBatch::batch_operator_table; -void ToBatch::toBatch(Block* block, Block* res_block) { - // change inputs of a graph - expand tensor to {data, mask, dims} - auto size = block->inputs().size(); - for(size_t i = 0; i < size; i++){ - auto input = block->inputs()[i]; +std::shared_ptr ToBatch::getBatchOperator(std::string name, int64_t num_inputs){ + if(batch_operator_table.find(name) == batch_operator_table.end()){ + throw std::runtime_error("function " + name + " is not supported in batched tensor yet"); + } + auto ops = batch_operator_table.at(name); + if(num_inputs == -1) // default function + return ops[0]; + for(auto op : ops){ + if(size_t(num_inputs) == op->inputs().size()) + return op; + } + throw std::runtime_error("function " + name + " with " + std::to_string(num_inputs) + " inputs is not supported in batched tensor yet"); +} + +// replace aten operator node with BatchTensor operator graph +void ToBatch::visitAten(Node* n, Block* block, Block* res_block){ + auto res_graph = res_block->owningGraph(); + auto func_name = 
std::string(n->kind().toUnqualString()); + std::vector new_inputs; + for(Value *input : n->inputs()){ + if(rn_env.find(input) == rn_env.end()){ // non-tensor input + auto new_input = batch_map.at(input); + new_inputs.insert(new_inputs.end(), new_input.begin(), new_input.end()); + } + else{ // batched tensor input + new_inputs.push_back(rn_env.at(input)); + } + } + + // transform scalar to tensor before pass to batch operator script + for(size_t i = 0; i < new_inputs.size(); i++){ + auto input = new_inputs[i]; + if(input->type() == IntType::get() || input->type() == FloatType::get()){ + auto to_tensor_node = res_graph->createNumToTensor(input); + res_graph->insertNode(to_tensor_node); + new_inputs[i] = to_tensor_node->output(); + } + } + + auto batch_graph = getBatchOperator(func_name, new_inputs.size()); + auto outputs = script::inlineCallTo(*res_block->owningGraph(), *batch_graph, new_inputs); + + // Assume all outputs from inlined operator implementation are in the triple form batched tensor or just a single non-tensor. + if(outputs.size() == 1){ + // if previous output is scalar, transform new output back to scalar from dynamic + if(n->outputs()[0]->type() != outputs[0]->type()){ + Node* to_scalar_node; + if(n->outputs()[0]->type() == IntType::get()){ + to_scalar_node = res_graph->createTensorToNum(IntType::get(), outputs[0]); + } + else if(n->outputs()[0]->type() == FloatType::get()){ + to_scalar_node = res_graph->createTensorToNum(FloatType::get(), outputs[0]); + } + else{ + throw std::runtime_error("NYI: scalar type other than int, float is not supported yet"); + } + res_graph->insertNode(to_scalar_node); + rn_env[n->outputs()[0]] = to_scalar_node->output(); + } + else + rn_env[n->outputs()[0]] = outputs[0]; + } + else{ + for(size_t i = 0; i < n->outputs().size(); i++){ + auto output = n->outputs()[i]; + batch_map[output] = std::vector(outputs.begin() + i * EXP_BTENSOR_SIZE, outputs.begin() + i * EXP_BTENSOR_SIZE + EXP_BTENSOR_SIZE); + } + } +} + +// clone prim::Constant to new graph +// batching transformation is applied to the output of prim::NumToTensor. +// If there is a prim::NumToTensor following prim::Constant, it will be finally transformed to BatchTensor. 
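The visitors below expand every tensor value into the triple {data, mask, dims}. As a rough mental model, an elementwise operation applies to data, ANDs the masks, and ORs the dims flags, which is the pattern the transformed graphs further down spell out with aten::mul on masks and aten::__or__ on dims. The sketch below is an illustrative assumption about that semantics, not the actual torch.jit BatchTensor implementation.

#include <cassert>
#include <vector>

// Toy model of an expanded batched tensor: per-element data, a validity
// mask, and per-dimension "does this dim vary across examples" flags.
struct ToyBatch {
  std::vector<double> data;
  std::vector<bool> mask;   // 1 where data is valid (padding is masked out)
  std::vector<bool> dims;   // 1 where a dimension is dynamic
};

// Elementwise op on the expanded form: apply the op to data, AND the masks,
// OR the dims -- mirroring the aten::add / aten::mul / aten::__or__ pattern
// in the transformed graphs shown below.
ToyBatch add(const ToyBatch& a, const ToyBatch& b) {
  ToyBatch out;
  for (size_t i = 0; i < a.data.size(); ++i) {
    out.data.push_back(a.data[i] + b.data[i]);
    out.mask.push_back(a.mask[i] && b.mask[i]);
  }
  for (size_t d = 0; d < a.dims.size(); ++d)
    out.dims.push_back(a.dims[d] || b.dims[d]);
  return out;
}

int main() {
  ToyBatch a{{1, 2, 0}, {true, true, false}, {true}};   // last slot is padding
  ToyBatch b{{4, 5, 6}, {true, true, true}, {false}};
  ToyBatch c = add(a, b);
  assert(c.data[1] == 7 && !c.mask[2] && c.dims[0]);
}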
+void ToBatch::visitConstant(Node* n, Block* block, Block* res_block){ + auto res_graph = res_block->owningGraph(); + auto* r_node = res_graph->createClone(n, rn_fn); + r_node->setStage(n->stage()); + res_block->appendNode(r_node); + rn_env[n->output()] = r_node->output(); +} + +// change return tensor to expanded batched tensor, eg: {data, mask, dims} +void ToBatch::visitNumToTensor(Node* n, Block* block, Block* res_block){ + auto res_graph = res_block->owningGraph(); + auto* r_node = res_graph->createClone(n, rn_fn); + r_node->setStage(n->stage()); + res_block->appendNode(r_node); + auto outputs = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("batch_from_scalar_tensor"), r_node->outputs()); + batch_map[n->output()] = outputs; +} + +// clone prim::TensorToNum to new graph +void ToBatch::visitTensorToNum(Node* n, Block* block, Block* res_block){ + auto res_graph = res_block->owningGraph(); + if(rn_env.find(n->input()) == rn_env.end()){ + rn_env[n->input()] = batch_map.at(n->input())[0]; + } + auto* r_node = res_graph->createClone(n, rn_fn); + r_node->setStage(n->stage()); + res_block->appendNode(r_node); + rn_env[n->output()] = r_node->output(); + batch_map[n->output()] = batch_map.at(n->input()); +} + +// clone prim::ListConstruct to new graph +void ToBatch::visitListConstruct(Node* n, Block* block, Block* res_block){ + auto res_graph = res_block->owningGraph(); + if(n->inputs()[0]->type() == DynamicType::get()){ // TensorList: expand directly + std::vector inputs; + for(Value* input: n->inputs()) { + auto res = batch_map.at(input); + inputs.insert(inputs.end(), res.begin(), res.end()); + } + batch_map[n->output()] = inputs; + } + else { // ScalarList: transform to tensor, then transform back + for(Value* input : n->inputs()) { + if(rn_env.find(input) == rn_env.end()){ + rn_env[input] = batch_map.at(input)[0]; + } + } + auto* r_node = res_graph->createClone(n, rn_fn); + r_node->setStage(n->stage()); + res_block->appendNode(r_node); + // transform int[] to tensor + auto to_tensor_node = res_graph->create(Symbol::fromQualString("aten::_list_to_tensor")); + to_tensor_node->setStage(n->stage()); + to_tensor_node->addInput(r_node->output()); + res_block->appendNode(to_tensor_node); + rn_env[n->output()] = to_tensor_node->output(); + } +} + +// prim::If transformation: +// elif is not supported +// +// transformation example: +// @torch.jit.batch(batch_size=4) +// def batch_if(a, b): +// if a > b: +// a += b +// else: +// a -= b +// return a +// +// original graph: +// graph(%a.1 : Dynamic +// %b : Dynamic) { +// %2 : Dynamic = aten::gt(%a.1, %b) +// %a : Dynamic = prim::If(%2) +// block0() { +// %a.2 : Dynamic = aten::add[alpha={1}](%a.1, %b) +// -> (%a.2) +// } +// block1() { +// %a.3 : Dynamic = aten::sub[alpha={1}](%a.1, %b) +// -> (%a.3) +// } +// return (%a); +// } +// +// transformed graph: +// graph(%a.1_data : Dynamic +// %a.1_mask : Dynamic +// %a.1_dims : Dynamic +// %b_data : Dynamic +// %b_mask : Dynamic +// %b_dims : Dynamic) { +// %6 : Dynamic = aten::gt(%a.1_data, %b_data) // calculate condition +// %7 : Dynamic = aten::mul(%a.1_mask, %b_mask) +// %8 : Dynamic = aten::__or__(%a.1_dims, %b_dims) +// %9 : int = prim::TensorToNum(%6) +// %10 : Long() = prim::Constant[value={1}]() // if_block +// %alpha.1 : float = prim::TensorToNum(%10) +// %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha.1) +// %mask.1 : Dynamic = aten::mul(%a.1_mask, %b_mask) +// %dims.1 : Dynamic = aten::__or__(%a.1_dims, %b_dims) +// %15 : Long() = prim::Constant[value={1}]() // 
else_block +// %alpha : float = prim::TensorToNum(%15) +// %data.4 : Dynamic = aten::sub(%a.1_data, %b_data, %alpha) +// %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) +// %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) +// %20 : Dynamic = aten::type_as(%7, %6) // combine two outputs (batch_where) +// %cond_mask.1 : Dynamic = aten::mul(%6, %20) +// %22 : int = aten::dim(%cond_mask.1) +// %23 : int = prim::Constant[value=1]() +// %24 : int = aten::eq(%22, %23) +// %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%24) +// block0() { +// %28 : int = aten::dim(%data.1) +// %29 : int = prim::Constant[value=1]() +// %30 : int = aten::sub(%28, %29) +// %31 : int = prim::Constant[value=1]() +// %data.3 : Dynamic = prim::Loop(%30, %31, %cond_mask.1) +// block0(%_ : int, %34 : Dynamic) { +// %35 : int = prim::Constant[value=1]() +// %36 : int = aten::neg(%35) +// %data.2 : Dynamic = aten::unsqueeze(%34, %36) +// %38 : int = prim::Constant[value=1]() +// -> (%38, %data.2) +// } +// %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) +// %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask.1) +// -> (%cond_data.1, %cond_mask.2, %data.3) +// } +// block1() { +// -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) +// } +// %res_data : Dynamic = aten::where(%cond_data, %data.1, %data.4) +// %res_mask : Dynamic = aten::where(%cond_mask, %mask.1, %mask) +// %res_dims : Dynamic = aten::__or__(%dims.1, %dims) +// return (%res_data, %res_mask, %res_dims); +// } +void ToBatch::visitIf(Node* n, Block* block, Block* res_block){ + toBatch(n->blocks()[0], res_block); + toBatch(n->blocks()[1], res_block); + + // combine results from two if paths + for(size_t i = 0; i < n->outputs().size(); i++){ + std::vector inputs; + if(batch_map.find(n->input()) == batch_map.end()){ // cond is scalar + inputs.push_back(rn_env.at(n->input())); + } + else{ // cond is tensor + auto cond = batch_map.at(n->input()); + inputs.insert(inputs.end(), cond.begin(), cond.end()); + } + auto if_output = batch_map.at(n->blocks()[0]->outputs()[i]); + inputs.insert(inputs.end(), if_output.begin(), if_output.end()); + auto else_output = batch_map.at(n->blocks()[1]->outputs()[i]); + inputs.insert(inputs.end(), else_output.begin(), else_output.end()); + auto outputs = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("where", inputs.size()), inputs); + batch_map[n->outputs()[i]] = outputs; + } +} + +// prim::Loop transformation: +// +// transformation example: +// @torch.jit.batch(batch_size=4) +// def batch_while(a, b): +// while a > b: +// a -= b +// return a +// +// original graph: +// graph(%a.1 : Dynamic +// %b : Dynamic) { +// %2 : int = prim::Constant[value={2147483647}]() +// %3 : Dynamic = aten::gt(%a.1, %b) +// %a : Dynamic = prim::Loop(%2, %3, %a.1) +// block0(%4 : Dynamic, %5 : Dynamic) { +// %a.2 : Dynamic = aten::sub[alpha={1}](%5, %b) +// %9 : Dynamic = aten::gt(%a.2, %b) +// -> (%9, %a.2) +// } +// return (%a); +// } +// +// transformed graph: +// graph(%a.1_data : Dynamic +// %a.1_mask : Dynamic +// %a.1_dims : Dynamic +// %b_data : Dynamic +// %b_mask : Dynamic +// %b_dims : Dynamic) { +// %6 : int = prim::Constant[value=2147483647]() +// %7 : Dynamic = aten::gt(%a.1_data, %b_data) +// %8 : Dynamic = aten::mul(%a.1_mask, %b_mask) +// %9 : Dynamic = aten::__or__(%a.1_dims, %b_dims) +// %10 : int = prim::TensorToNum(%7) +// %11 : Dynamic = aten::mul(%7, %8) +// %12 : Dynamic = aten::sum(%11) +// %13 : Dynamic = aten::gt[other={0}](%12) // cond_any +// %14 : int = prim::TensorToNum(%13) 
+// %62 : Dynamic, %63 : Dynamic, %64 : Dynamic, %a : Dynamic, %60 : Dynamic, %61 : Dynamic = prim::Loop(%6, %14, %7, %8, %9, %a.1_data, %a.1_mask, %a.1_dims) +// block0(%loop_num : int, %cond_data.2 : Dynamic, %cond_mask.3 : Dynamic, %cond_dims : Dynamic, %6_data : Dynamic, %6_mask : Dynamic, %6_dims : Dynamic) { +// %23 : Long() = prim::Constant[value={1}]() +// %alpha : float = prim::TensorToNum(%23) +// %data.1 : Dynamic = aten::sub(%6_data, %b_data, %alpha) +// %mask : Dynamic = aten::mul(%6_mask, %b_mask) +// %dims : Dynamic = aten::__or__(%6_dims, %b_dims) +// %28 : Dynamic = aten::gt(%data.1, %b_data) +// %29 : Dynamic = aten::mul(%mask, %b_mask) +// %30 : Dynamic = aten::__or__(%dims, %b_dims) +// %31 : int = prim::TensorToNum(%28) +// %32 : Dynamic = aten::type_as(%cond_mask.3, %cond_data.2) // update outputs (batch_where) +// %cond_mask.1 : Dynamic = aten::mul(%cond_data.2, %32) +// %34 : int = aten::dim(%cond_mask.1) +// %35 : int = prim::Constant[value=1]() +// %36 : int = aten::eq(%34, %35) +// %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%36) +// block0() { +// %40 : int = aten::dim(%data.1) +// %41 : int = prim::Constant[value=1]() +// %42 : int = aten::sub(%40, %41) +// %43 : int = prim::Constant[value=1]() +// %data.3 : Dynamic = prim::Loop(%42, %43, %cond_mask.1) +// block0(%_ : int, %46 : Dynamic) { +// %47 : int = prim::Constant[value=1]() +// %48 : int = aten::neg(%47) +// %data.2 : Dynamic = aten::unsqueeze(%46, %48) +// %50 : int = prim::Constant[value=1]() +// -> (%50, %data.2) +// } +// %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) +// %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) +// -> (%cond_data.1, %cond_mask.2, %data.3) +// } +// block1() { +// -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) +// } +// %res_data : Dynamic = aten::where(%cond_data, %data.1, %6_data) +// %res_mask : Dynamic = aten::where(%cond_mask, %mask, %6_mask) +// %res_dims : Dynamic = aten::__or__(%dims, %6_dims) +// %56 : Dynamic = aten::mul(%28, %29) +// %57 : Dynamic = aten::sum(%56) +// %58 : Dynamic = aten::gt[other={0}](%57) +// %59 : int = prim::TensorToNum(%58) +// -> (%59, %28, %29, %30, %res_data, %res_mask, %res_dims) +// } +// return (%a, %60, %61); +// } +void ToBatch::visitLoop(Node* n, Block* block, Block* res_block){ + auto res_graph = res_block->owningGraph(); + // bool cond_is_tensor indicates whether cond is tensor + // cond_is_tensor = false, eg: for loop, n->inputs()[1] = byte() + // cond_is_tensor = true, eg: in some while loop, cond is a batched tensor, + // we need to add expanded cond to the inputs of loop node and block, + // and compute cond_any as cond for while loop + bool cond_is_tensor = (batch_map.find(n->inputs()[1]) != batch_map.end()); + + // create prim::Loop node for res_block + + // type of cond in loop should be int type + if(rn_env.at(n->inputs()[0])->type() != IntType::get()){ + auto to_int_node = res_graph->createTensorToNum(IntType::get(), rn_env.at(n->inputs()[0])); + res_graph->insertNode(to_int_node); + rn_env[n->inputs()[0]] = to_int_node->output(); + } + if(cond_is_tensor){ + auto cond = batch_map.at(n->inputs()[1]); + auto cond_any = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("any"), cond); + auto to_int_node = res_graph->createTensorToNum(IntType::get(), cond_any[0]); + res_graph->insertNode(to_int_node); + rn_env[n->inputs()[1]] = to_int_node->output(); + } + for(size_t i = 2; i < n->inputs().size(); i++){ + auto input = n->inputs()[i]; + rn_env[input] = 
batch_map.at(input)[0]; + } + auto* r_node = res_graph->createClone(n, rn_fn, /*copy_blocks=*/false); + + // change inputs of prim::Loop + if(cond_is_tensor){ + for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ + auto cond = batch_map.at(n->inputs()[1]); + r_node->insertInput(i + 2, cond[i]); + } + } + for(size_t i = 2; i < n->inputs().size(); i++){ + for(size_t j = 1; j < EXP_BTENSOR_SIZE; j++){ + r_node->insertInput((i - 2) * EXP_BTENSOR_SIZE + EXP_BTENSOR_SIZE * cond_is_tensor + 2 + j, batch_map.at(n->inputs()[i])[j]); + } + } + r_node->setStage(n->stage()); + res_block->appendNode(r_node); + + // create block for Loop node in res_block + // if cond is tensor: first 4 inputs of block: cond_any, cond_data, cond_mask, cond_dims + // if cond is not tensor: first 1 input of block: cond + auto loop_block = r_node->addBlock(); + + // add inputs + loop_block->addInput("loop_num"); + loop_block->inputs()[0]->setType(IntType::get()); + rn_env[n->blocks()[0]->inputs()[0]] = loop_block->inputs()[0]; + if(cond_is_tensor){ + for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ + loop_block->addInput("cond_" + EXP_BTENSOR_NAME[i]); + } + } + for(size_t i = 1; i < n->blocks()[0]->inputs().size(); i++){ + auto input = n->blocks()[0]->inputs()[i]; auto name = input->uniqueName(); - res_block->addInput(name + "_data"); - res_block->addInput(name + "_mask"); - res_block->addInput(name + "_dims"); - batch_map[input] = std::vector(res_block->inputs().slice(i * 3, 3)); + for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ + loop_block->addInput(name + "_" + EXP_BTENSOR_NAME[j]); + } + batch_map[input] = std::vector(loop_block->inputs().slice((i - 1) * EXP_BTENSOR_SIZE + 1 + EXP_BTENSOR_SIZE * cond_is_tensor, EXP_BTENSOR_SIZE).vec()); + } + + toBatch(n->blocks()[0], loop_block); + + WithInsertPoint guard(loop_block); + + // use where operator to update variables and add to outputs + for(size_t i = 0; i < n->outputs().size(); i++){ + std::vector inputs, outputs; + if(cond_is_tensor){ + for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ + inputs.push_back(loop_block->inputs()[j + 1]); + } + auto data = batch_map.at(n->blocks()[0]->outputs()[i + 1]); + inputs.insert(inputs.end(), data.begin(), data.end()); + for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ + inputs.push_back(loop_block->inputs()[i * EXP_BTENSOR_SIZE + j + EXP_BTENSOR_SIZE + 1]); + } + outputs = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("where"), inputs); + } + else{ + for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ + inputs.push_back(loop_block->inputs()[i * EXP_BTENSOR_SIZE + j + 1]); + } + auto data = batch_map.at(n->blocks()[0]->outputs()[i + 1]); + inputs.insert(inputs.end(), data.begin(), data.end()); + outputs = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("update"), inputs); + } + batch_map[n->outputs()[i]] = outputs; + for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ + loop_block->registerOutput(outputs[j]); + } + } + + // update loop conditions + if(cond_is_tensor){ + auto cond = batch_map.at(n->blocks()[0]->outputs()[0]); + auto cond_any = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("any"), cond); + auto to_int_node = res_graph->createTensorToNum(IntType::get(), cond_any[0]); + res_graph->insertNode(to_int_node); + loop_block->insertOutput(0, to_int_node->output()); + for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ + loop_block->insertOutput(i + 1, cond[i]); + } + } + else{ + auto cond = rn_env.at(n->blocks()[0]->outputs()[0]); + loop_block->insertOutput(0, cond); + } + + // change outputs of 
prim::Loop + auto size = r_node->outputs().size(); + for(size_t i = 0; i < size; i++){ + for(size_t j = 1; j < EXP_BTENSOR_SIZE; j++){ + r_node->insertOutput(i * EXP_BTENSOR_SIZE + j); + } + batch_map[n->outputs()[i]] = r_node->outputs().slice(i * EXP_BTENSOR_SIZE, EXP_BTENSOR_SIZE).vec(); + } + // add cond to outputs of loop node + if(cond_is_tensor){ + for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ + r_node->insertOutput(i); + } + } +} + +void ToBatch::toBatch(Block* block, Block* res_block) { + WithInsertPoint guard(res_block); + + // change inputs of block - expand tensor to batchtensor eg: (data, mask, dims) + // eg: a -> a_data, a_mask, a_dims + // for block in prim::Loop, register inputs separately to deal with cond + if(!block->owningNode() || block->owningNode()->kind() != prim::Loop){ + auto size = block->inputs().size(); + for(size_t i = 0; i < size; i++){ + auto input = block->inputs()[i]; + auto name = input->uniqueName(); + for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ + res_block->addInput(name + "_" + EXP_BTENSOR_NAME[j]); + } + batch_map[input] = std::vector(res_block->inputs().slice(i * EXP_BTENSOR_SIZE, EXP_BTENSOR_SIZE).vec()); + } } for (auto it = block->nodes().begin(); it != block->nodes().end(); it++) { auto n = *it; - // replace tensor operator to BatchTensor operator if(n->kind().is_aten()){ - auto batch_graph = batch_operator_table.at(n->kind().toUnqualString()); - WithInsertPoint guard(res_block); - std::vector new_inputs; - for(Value *input : n->inputs()){ - if(batch_map.find(input) != batch_map.end()){ - auto new_input = batch_map.at(input); - new_inputs.insert(new_inputs.end(), new_input.begin(), new_input.end()); - } - else{ - throw std::runtime_error("NYI: non-tensor input for aten operator is not supported yet"); - } - } - auto outputs = script::inlineCallTo(*res_block->owningGraph(), *batch_graph, new_inputs); - // Assume all outputs from inlined operator implementation are in the triple form. 
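The index arithmetic in visitLoop above encodes a fixed layout for the transformed loop block's inputs: loop_num first, then the cond triple when the condition is a tensor, then one {data, mask, dims} triple per carried value. The sketch below restates that offset computation; carriedValueOffset and kTripleSize are illustrative helpers, not part of the pass.

#include <cassert>
#include <cstddef>

// Assumed layout of the transformed loop block's inputs (a sketch):
//   index 0                   : loop_num
//   indices 1..3              : cond_data, cond_mask, cond_dims (only if cond is a tensor)
//   then, per carried value i : <name>_data, <name>_mask, <name>_dims
constexpr std::size_t kTripleSize = 3;  // {data, mask, dims}

// First block-input index of the i-th carried value's triple.
std::size_t carriedValueOffset(std::size_t i, bool cond_is_tensor) {
  return 1 + (cond_is_tensor ? kTripleSize : 0) + i * kTripleSize;
}

int main() {
  // With a tensor condition: loop_num, cond triple, then value triples.
  assert(carriedValueOffset(0, true) == 4);
  assert(carriedValueOffset(1, true) == 7);
  // With a scalar condition the cond triple is absent.
  assert(carriedValueOffset(0, false) == 1);
}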
- for(size_t i = 0; i < n->outputs().size(); i++){ - auto output = n->outputs()[i]; - batch_map[output] = std::vector(outputs.begin() + i * 3, outputs.begin() + i * 3 + 3); - } + visitAten(n, block, res_block); } else if(n->kind().is_prim()){ - throw std::runtime_error("NYI: node of prim kind is not supported to transform to batch graph yet"); + switch(n->kind()){ + case prim::Constant: + visitConstant(n, block, res_block); + break; + case prim::NumToTensor: + visitNumToTensor(n, block, res_block); + break; + case prim::TensorToNum: + visitTensorToNum(n, block, res_block); + break; + case prim::ListConstruct: + visitListConstruct(n, block, res_block); + break; + case prim::If: + visitIf(n, block, res_block); + break; + case prim::Loop: + visitLoop(n, block, res_block); + break; + default: + throw std::runtime_error("NYI: node of prim kind other than [Constant, NumToTensor, TensorToNum, If, Loop] is not supported yet"); + } + } + else{ + throw std::runtime_error("NYI: node that is not aten or prim kind is not supported yet"); } } - // change outputs of a graph - expand tensor to {data, mask, dims} - for(Value* output : block->outputs()){ - auto r_output = batch_map.at(output); - res_block->registerOutput(r_output[0]); - res_block->registerOutput(r_output[1]); - res_block->registerOutput(r_output[2]); + // change outputs of block - expand tensor to batchtensor(data, mask, dims) + // for block in prim::Loop, register outputs separately to deal with cond and cond_any + // for block in prim::If, register outputs separately by combining outputs from two paths and return + if(!block->owningNode() || (block->owningNode()->kind() != prim::Loop && block->owningNode()->kind() != prim::If)) { + for(Value* output : block->outputs()){ + auto r_output = batch_map.at(output); + for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ + res_block->registerOutput(r_output[i]); + } + } } } std::shared_ptr to_batch_graph(std::shared_ptr& graph){ // std::cout<toString()<(graph->scope_root()); + std::shared_ptr res_graph = std::make_shared(graph->scope_root()); ToBatch to_batch; to_batch.toBatch(graph->block(), res_graph->block()); // std::cout<toString()<(); m.def("to_batch_graph", &to_batch_graph); m.def("register_batch_operator", [](std::string name, std::shared_ptr graph){ - ToBatch::batch_operator_table[name] = graph; + ToBatch::batch_operator_table[name].push_back(graph); }); } diff --git a/torch/csrc/jit/passes/to_batch.h b/torch/csrc/jit/passes/to_batch.h index 23c23a0632b310..6545e2a2d4f8ed 100644 --- a/torch/csrc/jit/passes/to_batch.h +++ b/torch/csrc/jit/passes/to_batch.h @@ -3,14 +3,33 @@ #include "torch/csrc/jit/pybind.h" #include "torch/csrc/jit/ir.h" +#include + namespace torch { namespace jit { class ToBatch { private: + // number of tensors to represent a expanded BatchTensor. {data, mask, dims} for now. 
+ const size_t EXP_BTENSOR_SIZE = 3; + const std::vector EXP_BTENSOR_NAME = {"data", "mask", "dims"}; // mapping from tensor in original graph to {data, mask, dims} in new graph std::unordered_map> batch_map; + // mapping from input in original graph to new input in new graph - used in createClone + std::unordered_map rn_env; + std::function rn_fn = [this](Value* v) { return rn_env.at(v); }; + +private: + std::shared_ptr getBatchOperator(std::string name, int64_t input_num = -1); + void visitAten(Node* n, Block* block, Block* res_block); + void visitConstant(Node* n, Block* block, Block* res_block); + void visitNumToTensor(Node* n, Block* block, Block* res_block); + void visitTensorToNum(Node* n, Block* block, Block* res_block); + void visitListConstruct(Node* n, Block* block, Block* res_block); + void visitIf(Node* n, Block* block, Block* res_block); + void visitLoop(Node* n, Block* block, Block* res_block); + public: - static std::unordered_map> batch_operator_table; + static std::unordered_map>> batch_operator_table; TORCH_API void toBatch(Block* block, Block* res_block); }; diff --git a/torch/csrc/jit/pybind_utils.h b/torch/csrc/jit/pybind_utils.h index 415fc311086ac9..0598e651d32437 100644 --- a/torch/csrc/jit/pybind_utils.h +++ b/torch/csrc/jit/pybind_utils.h @@ -4,26 +4,70 @@ namespace torch { namespace jit { -inline Stack createStack(const py::tuple& tuple, size_t reserve_extra_space = 0) { +inline Stack createStack(const py::tuple& tuple, at::ArrayRef inputs, size_t reserve_extra_space = 0) { + if (tuple.size() != inputs.size()) { + throw std::runtime_error("expected " + std::to_string(inputs.size()) + + " inputs, but got " + std::to_string(tuple.size())); + } + static const auto castToIValue = [](const py::object& obj, Type& t) -> IValue{ + switch (t.kind()) { + case TypeKind::DynamicType: + case TypeKind::TensorType: + return py::cast(obj); + case TypeKind::FloatType: + return py::cast(obj); + case TypeKind::IntType: + return py::cast(obj); + case TypeKind::NoneType: + return {}; + case TypeKind::ListType: + case TypeKind::TupleType: + throw std::runtime_error("Lists and tuples are not supported yet"); + case TypeKind::NumberType: + throw std::runtime_error("Insufficient type information to convert input"); + } + throw std::runtime_error("Missing cases in castToIValue! 
File a bug report."); + }; Stack result; result.reserve(tuple.size() + reserve_extra_space); - for(auto e : tuple) { - result.push_back(py::cast(e)); + for (size_t i = 0; i < inputs.size(); ++i) { + result.push_back(castToIValue(tuple[i], *inputs[i]->type())); } return result; } -inline py::object wrapStack(Stack&& outputs) { +inline py::object wrapStack(Stack&& outputs, at::ArrayRef output_vals) { + if (outputs.size() != output_vals.size()) { + throw std::runtime_error("expected " + std::to_string(output_vals.size()) + + " outputs, but got " + std::to_string(outputs.size())); + } + static const auto createOutput = [](IValue && ivalue, Value * value) -> py::object { + switch (value->type()->kind()) { + case TypeKind::DynamicType: + case TypeKind::TensorType: + return py::cast(autograd::Variable(ivalue.toTensor())); + case TypeKind::FloatType: + return py::cast(ivalue.toDouble()); + case TypeKind::IntType: + return py::cast(ivalue.toInt()); + case TypeKind::NoneType: + return py::none(); + case TypeKind::ListType: + case TypeKind::TupleType: + throw std::runtime_error("Lists and tuples are not supported yet"); + case TypeKind::NumberType: + throw std::runtime_error("Insufficient type information to convert input"); + } + throw std::runtime_error("Missing cases in createOutput! File a bug report."); + }; if (outputs.size() == 0) { return py::none(); } else if (outputs.size() == 1) { - JIT_ASSERT(outputs[0].isTensor()); - return py::cast(autograd::as_variable_ref(std::move(outputs[0]).toTensor())); + return createOutput(std::move(outputs[0]), output_vals[0]); } else { py::tuple tuple(outputs.size()); for(size_t i = 0; i < outputs.size(); i++) { - JIT_ASSERT(outputs[i].isTensor()); - tuple[i] = py::cast(autograd::as_variable_ref(std::move(outputs[i]).toTensor())); + tuple[i] = createOutput(std::move(outputs[i]), output_vals[i]); } return tuple; } diff --git a/torch/csrc/jit/python_arg_flatten.h b/torch/csrc/jit/python_arg_flatten.h index b5139032fde169..3e1477e52e0701 100644 --- a/torch/csrc/jit/python_arg_flatten.h +++ b/torch/csrc/jit/python_arg_flatten.h @@ -14,7 +14,7 @@ namespace torch { namespace jit { namespace python { struct IODescriptor { struct VariableMetadata { VariableMetadata(const autograd::Variable& var) - : sizes(var.sizes()) + : sizes(var.sizes().vec()) , type(var.type().scalarType()) , device(var.type().is_cuda() ? 
var.get_device() : -1) , requires_grad(var.requires_grad()) {} @@ -104,7 +104,7 @@ struct ParsedArgs { ParsedArgs flatten(py::handle obj); -PyObject* unflatten(at::ArrayRef outputs, +PyObject* unflatten(at::ArrayRef vars, const IODescriptor& structure); }}} // namespace torch::jit::python diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index 81211085569953..b72fdb6b8860b1 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -451,10 +451,22 @@ void initPythonIRBindings(PyObject * module_) { .def("scalarType",[](Type& t) { return at::toString(t.expect()->scalarType()); }) - ; + .def("__eq__", [](std::shared_ptr& self, std::shared_ptr& other) { + return *self == *other; + }) + .def("isSubtypeOf", [](std::shared_ptr& self, std::shared_ptr other) { + return self->isSubtypeOf(other); + }); + py::class_>(m, "NumberType") + .def_static("get", &NumberType::get); + py::class_>(m, "IntType") + .def_static("get", &IntType::get); + py::class_>(m, "FloatType") + .def_static("get", &FloatType::get); py::class_>(m, "DynamicType") - .def(py::init([](){ return DynamicType::create(); })); + .def_static("get", &DynamicType::get); + py::class_>(m, "TupleType") .def(py::init([](std::vector a){ return TupleType::create(a); })) .def("elements", [](TupleType &self){ @@ -465,7 +477,9 @@ void initPythonIRBindings(PyObject * module_) { return types; }); py::class_>(m, "ListType") - .def_static("ofInts", &ListType::ofInts); + .def_static("ofInts", &ListType::ofInts) + .def_static("ofTensors", &ListType::ofTensors) + .def("getElementType", &ListType::getElementType); py::class_(m,"Use") .def_readonly("user",&Use::user) diff --git a/torch/csrc/jit/python_tracer.cpp b/torch/csrc/jit/python_tracer.cpp index 7439b2b5e334cc..0496af67412654 100644 --- a/torch/csrc/jit/python_tracer.cpp +++ b/torch/csrc/jit/python_tracer.cpp @@ -103,10 +103,10 @@ void pythonRecordSourceLocation(Node* n) { n->setSourceLocation(sl); } -void initPythonTracerBindings(PyObject* module_) { +void initPythonTracerBindings(PyObject* module) { setRecordSourceLocation(pythonRecordSourceLocation); - auto m = py::handle(module_).cast(); + auto m = py::handle(module).cast(); py::class_>(m, "TracingState", py::dynamic_attr()) // NB: no constructor; you have to get it from C++ code .def("__repr__", [](const TracingState& s) { diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index 8fe747e59900f0..f2b8ea18a2be24 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -231,6 +231,18 @@ RegisterOperators reg({ push(stack, std::move(vals)); return 0; }; + } else if (lt->getElementType()->isSubtypeOf(DynamicType::get())) { + return [=](Stack& stack) { + const size_t stack_size = stack.size(); + std::vector vals; + vals.reserve(num_inputs); + for (size_t i = stack_size - num_inputs; i < stack_size; ++i) { + vals.push_back(std::move(stack[i]).toTensor()); + } + drop(stack, num_inputs); + push(stack, std::move(vals)); + return 0; + }; } else { std::stringstream ss; ss << "unsupported list type: " << *lt->getElementType(); @@ -335,7 +347,35 @@ RegisterOperators reg2({ return 0; }; }), - + Operator( + "aten::_tensor_to_list(Tensor a) -> int[]", + [](Node* node) { + return [=](Stack& stack) { + at::Tensor t; + pop(stack, t); + std::vector elems; + for(int i = 0; i < t.size(0); i++){ + elems.push_back(*t[i].toIntData()); + } + push(stack, jit::IntList::create(elems)); + return 0; + }; + }), + Operator( + "aten::_list_to_tensor(int[] a) -> 
Tensor", + [](Node* node) { + return [=](Stack& stack) { + std::vector l; + pop(stack, l); + auto t = torch::empty( + {static_cast(l.size())}, at::dtype(at::kInt)); + for(size_t i = 0; i < l.size(); i++){ + t[i] = l[i]; + } + push(stack, t); + return 0; + }; + }), // commutative DEFINE_ST_OP(mul, at::mul(b, a)) DEFINE_ST_OP(add, at::add(b, a)) diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 0016f69b5ce07b..4f27cb25b53cb7 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -351,37 +351,19 @@ Value* createNumber(Graph& g, const SourceRange& loc, const at::Tensor& val) { return output; } -Value* createStack(Graph& g, const SourceRange& loc, at::ArrayRef inputs) { - // bake in constant propagation for the all-constant case because it is - // common to see constant lists like [1, 2] passed to attributes - bool all_constant = std::all_of(inputs.begin(), inputs.end(), [&](Value* v) { - return v->node()->kind() == prim::Constant; - }); - if(all_constant) { - auto values = fmap(inputs, [&](Value* v) { - return v->node()->t(attr::value); - }); - return insertConstant(g, at::stack(values), loc); - } - return g.insertNode(g.create(aten::stack, inputs) - ->i_(attr::dim, 0) - ->setSourceLocation(std::make_shared(loc)))->output(); -} - -static bool isTensorSubtype(Value* v) { - return v->type()->isSubtypeOf(DynamicType::get()); -} - at::optional> getIntListAttribute(at::optional N, Value* input) { auto list = constant_as>(input); if(list) - return std::vector(list.value()->elements()); + return list.value()->elements().vec(); + // broadcast IntList[3] with value 4 -> {4, 4, 4} if(!N) return at::nullopt; + auto r = constant_as(input); if(!r) return at::nullopt; + // broadcast to attribute size return std::vector(*N, *r); } @@ -455,51 +437,46 @@ at::optional> tryMatchSchema( } // check input types - std::vector flat_inputs; + std::vector matched_inputs; for(size_t i = 0; i < schema.arguments.size(); ++i) { - NamedValue v = *positional_inputs[i]; + Value* value = positional_inputs[i]->value; const auto& arg = schema.arguments[i]; // some functions that take lists of integers for fixed size arrays // also allow single ints to be passed in their place. 
// the single int is then repeated to the length of the list - if (isIntUsedAsIntList(v.value, arg)) { - std::vector repeated(*arg.N, v.value); - v.value = graph.insertNode(graph.createList(IntType::get(), repeated))->output(); + if (isIntUsedAsIntList(value, arg)) { + std::vector repeated(*arg.N, value); + value = graph.insertNode(graph.createList(IntType::get(), repeated))->output(); } - // Allow tuples that only contain integers to turn into lists of integers - if(*ListType::ofInts() == *arg.type && - v.value->type()->kind() == TypeKind::TupleType && - v.value->type()->isSubtypeOf(ListType::ofInts())) { - auto unpacked = createTupleUnpack(v.value); - v.value = graph.insertNode(graph.createList(IntType::get(), unpacked))->output(); + // Allow homogeneous tuples to be casted implicitly to lists of appropriate types + if (arg.type->kind() == TypeKind::ListType && + value->type()->kind() == TypeKind::TupleType && + value->type()->isSubtypeOf(arg.type)) { + auto unpacked = createTupleUnpack(value); + auto elem_type = arg.type->expect()->getElementType(); + value = graph.insertNode(graph.createList(elem_type, unpacked))->output(); } - if (v.value->node()->kind() == prim::None){ + if (value->node()->kind() == prim::None){ if (arg.type->isSubtypeOf(NumberType::get())) - v.value = insertConstant(graph, at::Scalar(NAN), loc); + value = insertConstant(graph, at::Scalar(NAN), loc); else - v.value = graph.insertNode(graph.createUndefined())->output(); + value = graph.insertNode(graph.createUndefined())->output(); } - if(!v.value->type()->isSubtypeOf(arg.type)) { + if(!value->type()->isSubtypeOf(arg.type)) { err() << "expected a value of type " << arg.type->str() << " for argument '" << arg.name << "' but found " - << v.value->type()->str() << "\n" - << v.loc; + << value->type()->str() << "\n" + << positional_inputs[i]->loc; return at::nullopt; } - // we only support tensor lists for builtins, where they must be flattened - if(arg.type->isSubtypeOf(ListType::ofTensors())) { - auto outputs = createTupleUnpack(v.value); - flat_inputs.insert(flat_inputs.end(), outputs.begin(), outputs.end()); - } else { - flat_inputs.push_back(v.value); - } + matched_inputs.push_back(value); } - return flat_inputs; + return matched_inputs; } @@ -513,27 +490,27 @@ static std::shared_ptr tryEmitBuiltin( at::ArrayRef attributes) { auto graph = method.graph(); - auto flat_inputs = tryMatchSchema(op->schema, loc, *graph, inputs, attributes, failure_messages); - if(!flat_inputs) + auto matched_inputs = tryMatchSchema(op->schema(), loc, *graph, inputs, attributes, failure_messages); + if(!matched_inputs) return nullptr; // we successfully matched this schema, construct the node NodeKind kind(Symbol::aten(name)); - auto n = graph->insertNode(graph->create(kind, *flat_inputs, 0)) + auto n = graph->insertNode(graph->create(kind, *matched_inputs, 0)) ->setSourceLocation(std::make_shared(loc)); // special case for chunk when the chunks= is known // DO NOT ADD MORE SPECIAL CASES HERE, REFACTOR INTO A FUNCTION IF // NEEDED if(n->kind() == aten::chunk) { - auto value = constant_as((*flat_inputs)[1]); + auto value = constant_as((*matched_inputs)[1]); if(!value) { throw ErrorReport(loc) << "argument 'chunks' must be a constant"; } for(int64_t i = 0; i < *value; ++i) n->addOutput(); } else { - for(auto & ret : op->schema.returns) { + for(auto & ret : op->schema().returns) { n->addOutput()->setType(ret.type); } } @@ -588,7 +565,7 @@ std::shared_ptr emitBuiltinCall( } static Value* ensureTensor(const SourceRange& range, Value* v) { - 
if(!isTensorSubtype(v)) { + if(!v->type()->isSubtypeOf(DynamicType::get())) { throw ErrorReport(range) << "expected a tensor value but found a " << v->type()->str(); } @@ -700,7 +677,7 @@ struct to_ir { if (return_stmt.values().size() == 1 && results.size() == 1) { auto result = results.at(0); if(result->type()->cast()) { - results = createTupleUnpack(result); + results = createTupleUnpack(result).vec(); } } if (typed_def.schema && typed_def.schema->returns.size() != results.size()) { @@ -711,12 +688,16 @@ struct to_ir { auto range = return_stmt.range(); size_t return_type_idx = 0; for (auto& r : results) { - if(r->type()->isSubtypeOf(NumberType::get())) { - graph->registerOutput(numToTensor(range, r)); - } else { - ensureTensor(range, r); - graph->registerOutput(r); + // TODO: support tuples and lists as returns + auto return_kind = r->type()->kind(); + if (return_kind != TypeKind::TensorType && + return_kind != TypeKind::DynamicType && + return_kind != TypeKind::IntType && + return_kind != TypeKind::FloatType) { + throw ErrorReport(return_stmt.range()) << "The only supported return types " + << "are tensors, ints and floats"; } + graph->registerOutput(r); TypePtr type = DynamicType::get(); if (typed_def.schema) { type = typed_def.schema->returns.at(return_type_idx).type; @@ -1387,6 +1368,11 @@ struct to_ir { auto values = getValues(ll.inputs(), /*maybe_unpack=*/true, identity); return graph->insertNode(graph->createTuple(values))->output(); } break; + case TK_TUPLE_LITERAL: { + auto ll = TupleLiteral(tree); + auto values = getValues(ll.inputs(), /*maybe_unpack=*/true, identity); + return graph->insertNode(graph->createTuple(values))->output(); + } break; default: throw ErrorReport(tree) << "NYI: " << tree; break; diff --git a/torch/csrc/jit/script/compiler.h b/torch/csrc/jit/script/compiler.h index 0b87cf56be6ad3..3c4dcb07a248ee 100644 --- a/torch/csrc/jit/script/compiler.h +++ b/torch/csrc/jit/script/compiler.h @@ -68,7 +68,7 @@ struct SugaredValue : public std::enable_shared_from_this { SourceRange loc, Method & m, // note: names for args will be 'argument 0', 'argument 1', etc.. 
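Referring back to the homogeneous-tuple-to-list coercion in tryMatchSchema above, the acceptance rule can be summarised with a toy type check: a tuple may stand in for a list argument only when every element matches the list's element type. ToyListType, ToyTupleType, and tupleCoercesToList are illustrative stand-ins, not the real TupleType/ListType/isSubtypeOf machinery.

#include <cassert>
#include <string>
#include <vector>

// Toy stand-ins for the type system used during schema matching.
struct ToyListType  { std::string elem; };
struct ToyTupleType { std::vector<std::string> elems; };

// A homogeneous tuple may be implicitly converted to a list of the matching
// element type, e.g. (int, int, int) -> int[]; mixed tuples may not.
bool tupleCoercesToList(const ToyTupleType& tup, const ToyListType& list) {
  for (const auto& e : tup.elems)
    if (e != list.elem) return false;
  return true;
}

int main() {
  assert(tupleCoercesToList({{"int", "int", "int"}}, {"int"}));
  assert(!tupleCoercesToList({{"int", "float"}}, {"int"}));
}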
- at::ArrayRef inputs, + at::ArrayRef inputs_, at::ArrayRef attributes, size_t n_binders) { // n_binders is always set to the number of variables an expression is @@ -89,7 +89,7 @@ struct SugaredValue : public std::enable_shared_from_this { throw ErrorReport(loc) << "cannot call a " << kind(); } - virtual ~SugaredValue() {} + virtual ~SugaredValue() = default; }; // most things in the environment are just simple value types diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index cb7893234dc747..39bb51ed89ca5d 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -370,10 +370,15 @@ static void gatherParametersAndBuffers(std::vector & values, const } } +Stack createStack(const py::tuple& tuple, const Method& method) { + auto relevant_inputs = method.graph()->inputs().slice(0, method.num_inputs()); + return createStack(tuple, relevant_inputs); +} + py::object runMethodFromPython(Method& m, py::args args) { - auto stack = createStack(args); + auto stack = createStack(args, m); m.run(stack); - return wrapStack(std::move(stack)); + return wrapStack(std::move(stack), m.graph()->outputs()); } void initJitScriptBindings(PyObject* module) { @@ -502,7 +507,8 @@ void initJitScriptBindings(PyObject* module) { }) .def("graph_for", [](Module& self, py::args args) { if (self.find_method("forward")) { - return self.get_method("forward").graph_for(createStack(args)); + Method & m = self.get_method("forward"); + return m.graph_for(createStack(args, m.graph()->inputs())); } throw std::runtime_error("Attempted to call graph_for on a Module without a compiled forward()"); }) @@ -530,7 +536,7 @@ void initJitScriptBindings(PyObject* module) { .def("propagate_and_assign_input_and_output_shapes", &Method::propagate_and_assign_input_and_output_shapes) .def("params", &Method::params) .def("graph_for", [](Method& self, py::args args) { - return self.graph_for(createStack(args)); + return self.graph_for(createStack(args, self.graph()->inputs())); }) .def("set_arg_and_return_types", [](Method &self, TypedDef &typed_def, bool method) { std::vector arg_type_args, return_type_args; diff --git a/torch/csrc/jit/script/lexer.h b/torch/csrc/jit/script/lexer.h index 912b488dde5d9e..1694889d630d39 100644 --- a/torch/csrc/jit/script/lexer.h +++ b/torch/csrc/jit/script/lexer.h @@ -75,6 +75,7 @@ namespace script { _(TK_GATHER, "gather", "") \ _(TK_NOTHING, "nothing", "") \ _(TK_LIST_LITERAL, "list-literal", "") \ + _(TK_TUPLE_LITERAL, "tuple-literal", "") \ _(TK_FOR, "for", "for") \ _(TK_IN, "in", "in") \ _(TK_STARRED, "starred", "") \ diff --git a/torch/csrc/jit/script/parser.h b/torch/csrc/jit/script/parser.h index abea2778053699..0cd833dc15e488 100644 --- a/torch/csrc/jit/script/parser.h +++ b/torch/csrc/jit/script/parser.h @@ -30,7 +30,7 @@ struct Parser { List(makeList(range, std::move(attributes)))); } // exp | expr, | expr, expr, ... 
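The grammar comment above ("exp | expr, | expr, expr, ...") is the rule parseExpOrExpTuple implements: a lone expression stays an expression, while a trailing comma or multiple comma-separated expressions produce a tuple literal, and empty parentheses produce the empty tuple. Below is a hedged standalone sketch of just that decision, independent of the real Lexer/TreeRef types; the token-vector interface is an assumption for illustration.

#include <cassert>
#include <string>
#include <vector>

// Result of the toy rule: either a single expression or a tuple literal.
struct ParsedExpr {
  bool is_tuple;
  std::vector<std::string> parts;
};

// Sketch of "exp | expr, | expr, expr, ...": a trailing comma or more than
// one expression yields a tuple; no expressions models the "()" empty tuple.
ParsedExpr parseExpOrExpTuple(const std::vector<std::string>& tokens) {
  ParsedExpr result{false, {}};
  bool saw_comma = false;
  for (const auto& tok : tokens) {
    if (tok == ",") { saw_comma = true; continue; }
    result.parts.push_back(tok);
  }
  result.is_tuple = saw_comma || result.parts.size() != 1;
  return result;
}

int main() {
  assert(!parseExpOrExpTuple({"a"}).is_tuple);           // plain expression
  assert(parseExpOrExpTuple({"a", ","}).is_tuple);       // trailing comma
  assert(parseExpOrExpTuple({"a", ",", "b"}).is_tuple);  // expression list
  assert(parseExpOrExpTuple({}).is_tuple);               // () is the empty tuple
}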
- TreeRef parseExpOrExpList(int end) { + TreeRef parseExpOrExpTuple(int end) { auto prefix = parseExp(); if(L.cur().kind == ',') { std::vector exprs = { prefix }; @@ -39,7 +39,7 @@ struct Parser { exprs.push_back(parseExp()); } auto list = List::create(prefix.range(), exprs); - prefix = ListLiteral::create(list.range(), list); + prefix = TupleLiteral::create(list.range(), list); } return prefix; } @@ -61,7 +61,14 @@ struct Parser { } break; case '(': { L.next(); - prefix = parseExpOrExpList(')'); + if (L.nextIf(')')) { + /// here we have the empty tuple case + std::vector vecExpr; + List listExpr = List::create(L.cur().range, vecExpr); + prefix = TupleLiteral::create(L.cur().range, listExpr); + break; + } + prefix = parseExpOrExpTuple(')'); L.expect(')'); } break; case '[': { @@ -242,7 +249,7 @@ struct Parser { // first[,other,lhs] = rhs Assign parseAssign(List list) { auto red = parseOptionalReduction(); - auto rhs = parseExpOrExpList(TK_NEWLINE); + auto rhs = parseExpOrExpTuple(TK_NEWLINE); L.expect(TK_NEWLINE); return Assign::create(list.range(), list, AssignKind(red), Expr(rhs)); } diff --git a/torch/csrc/jit/script/python_tree_views.cpp b/torch/csrc/jit/script/python_tree_views.cpp index 569d1b0e66fdf3..7ece5e055a33df 100644 --- a/torch/csrc/jit/script/python_tree_views.cpp +++ b/torch/csrc/jit/script/python_tree_views.cpp @@ -193,6 +193,10 @@ void initTreeViewBindings(PyObject *module) { .def(py::init([](const SourceRange& range, std::vector args) { return ListLiteral::create(range, wrap_list(range, std::move(args))); })); + py::class_(m, "TupleLiteral") + .def(py::init([](const SourceRange& range, std::vector args) { + return TupleLiteral::create(range, wrap_list(range, std::move(args))); + })); py::class_(m, "Gather") .def(py::init([](const Expr& base, const Expr& index) { return Gather::create(base.range(), base, index); diff --git a/torch/csrc/jit/script/tree.h b/torch/csrc/jit/script/tree.h index e3d69d2790682d..0b9bc7009e0162 100644 --- a/torch/csrc/jit/script/tree.h +++ b/torch/csrc/jit/script/tree.h @@ -89,7 +89,7 @@ struct Tree : std::enable_shared_from_this { throw std::runtime_error(ss.str()); } } - virtual ~Tree() {} + virtual ~Tree() = default; private: int kind_; diff --git a/torch/csrc/jit/script/tree_views.h b/torch/csrc/jit/script/tree_views.h index 6cc934ab4d177a..10ac01799c0607 100644 --- a/torch/csrc/jit/script/tree_views.h +++ b/torch/csrc/jit/script/tree_views.h @@ -58,6 +58,7 @@ namespace script { // | Gather(Expr value, Expr indices) TK_GATHER // | Var(Ident name) TK_VAR // | ListLiteral(List inputs) TK_LIST_LITERAL +// | TupleLiteral(List inputs) TK_TUPLE_LITERAL // | Starred(Expr expr) TK_STARRED // // -- NB: only allowed expressions are Const or List(Const) @@ -255,6 +256,7 @@ struct Expr : public TreeView { case TK_GATHER: case TK_VAR: case TK_LIST_LITERAL: + case TK_TUPLE_LITERAL: case '@': case TK_POW: return; @@ -694,6 +696,17 @@ struct ListLiteral : public Expr { } }; +struct TupleLiteral : public Expr { + explicit TupleLiteral(const TreeRef& tree) : Expr(tree) { + tree_->match(TK_TUPLE_LITERAL); + } + List inputs() const { + return subtree(0); + } + static TupleLiteral create(const SourceRange& range, const List& inputs) { + return TupleLiteral(Compound::create(TK_TUPLE_LITERAL, range, {inputs})); + } +}; struct Starred : public Expr { explicit Starred(const TreeRef& tree) : Expr(tree) { diff --git a/torch/csrc/jit/stack.h b/torch/csrc/jit/stack.h index 2c74ae7e0a4c77..7a23aa55df538f 100644 --- a/torch/csrc/jit/stack.h +++ b/torch/csrc/jit/stack.h @@ 
-77,8 +77,8 @@ inline void pack(Stack & stack, T&& v) { } template<> -inline void pack(Stack & stack, std::vector&& ts) { - for(auto& t : ts) { +inline void pack(Stack & stack, std::vector&& v) { + for(auto& t : v) { stack.push_back(IValue(std::move(t))); } } diff --git a/torch/csrc/jit/symbolic_variable.h b/torch/csrc/jit/symbolic_variable.h index e4d2f98ba0ea0f..ef6d41005789f8 100644 --- a/torch/csrc/jit/symbolic_variable.h +++ b/torch/csrc/jit/symbolic_variable.h @@ -119,18 +119,20 @@ struct SymbolicVariable { return create(t("narrow"), { *this, insertConstant(dim), insertConstant(start), insertConstant(length) }, 1)[0]; } static SymbolicVariable cat(ArrayRef inputs, Value* dim) { - std::vector all_inputs = inputs; - all_inputs.push_back(dim); - return create(aten::cat, all_inputs)[0]; + Graph *g = dim->owningGraph(); + auto value_inputs = fmap(inputs, [](const SymbolicVariable & v) { return v.value(); }); + Value *input_list = g->insertNode(g->createList(DynamicType::get(), value_inputs))->output(); + return create(aten::cat, {input_list, dim})[0]; } static SymbolicVariable cat(ArrayRef inputs, int dim) { JIT_ASSERT(inputs.size() > 0); return SymbolicVariable::cat(inputs, inputs[0].insertConstant(dim)); } static SymbolicVariable stack(ArrayRef inputs, Value* dim) { - std::vector all_inputs = inputs; - all_inputs.push_back(dim); - return create(aten::stack, all_inputs)[0]; + Graph *g = dim->owningGraph(); + auto value_inputs = fmap(inputs, [](const SymbolicVariable & v) { return v.value(); }); + Value *input_list = g->insertNode(g->createList(DynamicType::get(), value_inputs))->output(); + return create(aten::stack, {input_list, dim})[0]; } static SymbolicVariable stack(ArrayRef inputs, int dim) { JIT_ASSERT(inputs.size() > 0); diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp index 8c9763f88353e5..d5d204f9465bd8 100644 --- a/torch/csrc/jit/test_jit.cpp +++ b/torch/csrc/jit/test_jit.cpp @@ -220,6 +220,9 @@ static void fusionTests() { testOne(1,2,0,2); + auto createFusedConcat = [](Graph & graph, at::ArrayRef inputs, int64_t dim) -> Value* { + return graph.insertNode(graph.create(prim::FusedConcat, inputs)->i_(attr::dim, dim))->output(); + }; auto testConcat = [&](int dim) { Graph graph; @@ -227,7 +230,7 @@ static void fusionTests() { Var i1 = Var::asNewInput(graph); auto o0 = i0 * i1; o0.addAsOutput(); - Var::cat({i0, o0}, dim).addAsOutput(); + Var(createFusedConcat(graph, {i0, o0}, dim)).addAsOutput(); auto a = at::rand({3,4,5}, at::kCUDA); auto b = at::rand({4,3,5}, at::kCUDA).transpose(0,1); @@ -776,6 +779,9 @@ void argumentSpecTest() { REQUIRE(!(c == a)); REQUIRE(spec.count(c) == 0); + Stack stack = { var(CF, {1,2}, true), 3, var(CF, {1,2}, true) }; + ArgumentSpec with_const(true, stack); + REQUIRE(with_const.at(2).sizes().size() == 2); } void shapeAnalysisTest() { diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index aec6eb4ddc9447..a0e2f65e617754 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -38,9 +38,9 @@ void addInputs(Node *n, const char * name, const std::string& value) { b void addInputs(Node *n, const char * name, const at::SparseTensorRef& value) { badArgType(); } void addInputs(Node *n, const char * name, at::TensorList value) { - for (auto & t : value) { - n->addInput(getValueTrace(t)); - } + Graph *g = n->owningGraph(); + Node *list_node = g->appendNode(g->createList(DynamicType::get(), fmap(value, getValueTrace))); + n->addInput(list_node->output()); } void addInputs(Node *n, const char * name, 
at::IntList value) { diff --git a/torch/csrc/jit/type.cpp b/torch/csrc/jit/type.cpp index ebcc91a908c213..ddb4dfad0154ad 100644 --- a/torch/csrc/jit/type.cpp +++ b/torch/csrc/jit/type.cpp @@ -46,31 +46,31 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { return out; } -TypePtr DynamicType::get() { +DynamicTypePtr DynamicType::get() { static auto value = DynamicType::create(); return value; } -TypePtr NumberType::get() { +NumberTypePtr NumberType::get() { static auto value = NumberType::create(); return value; } -TypePtr IntType::get() { +IntTypePtr IntType::get() { static auto value = IntType::create(); return value; } -TypePtr FloatType::get() { +FloatTypePtr FloatType::get() { static auto value = FloatType::create(); return value; } -TypePtr NoneType::get() { +NoneTypePtr NoneType::get() { static auto value = NoneType::create(); return value; } -TypePtr ListType::ofTensors() { +ListTypePtr ListType::ofTensors() { static auto value = ListType::create(DynamicType::get()); return value; } -TypePtr ListType::ofInts() { +ListTypePtr ListType::ofInts() { static auto value = ListType::create(IntType::get()); return value; } diff --git a/torch/csrc/jit/type.h b/torch/csrc/jit/type.h index 7b7d708a549b32..713718e40681c8 100644 --- a/torch/csrc/jit/type.h +++ b/torch/csrc/jit/type.h @@ -80,7 +80,7 @@ struct TORCH_API Type : std::enable_shared_from_this { JIT_ASSERT(T::Kind == kind()); return std::static_pointer_cast(shared_from_this()); } - virtual ~Type() {} + virtual ~Type() = default; }; inline bool operator!=(const Type & lhs, const Type & rhs) { @@ -104,7 +104,7 @@ struct TORCH_API DynamicType : public Type { } static const TypeKind Kind = TypeKind::DynamicType; // global singleton - static TypePtr get(); + static DynamicTypePtr get(); private: DynamicType() : Type(TypeKind::DynamicType) {} @@ -186,16 +186,16 @@ struct TORCH_API TensorType : public Type { : Type(TypeKind::TensorType) , scalar_type_(tensor.type().scalarType()) , device_(tensor.type().is_cuda() ? 
tensor.get_device() : -1) - , sizes_(tensor.sizes()) - , strides_(tensor.strides()) {} + , sizes_(tensor.sizes().vec()) + , strides_(tensor.strides().vec()) {} TensorType(at::ScalarType scalar_type, int device, at::IntList sizes) : TensorType(scalar_type, device, sizes, TensorType::contiguousStridesOf(sizes)) {} TensorType(at::ScalarType scalar_type, int device, at::IntList sizes, at::IntList strides) : Type(TypeKind::TensorType) , scalar_type_(scalar_type) , device_(device) - , sizes_(sizes) - , strides_(strides) + , sizes_(sizes.vec()) + , strides_(strides.vec()) {} static std::vector contiguousStridesOf(at::IntList sizes) { std::vector strides(sizes.size()); @@ -237,8 +237,8 @@ struct TORCH_API ListType : public Type { return elem; } // common cast List[Tensor] - static TypePtr ofTensors(); - static TypePtr ofInts(); + static ListTypePtr ofTensors(); + static ListTypePtr ofInts(); private: ListType(TypePtr elem) : Type(TypeKind::ListType), elem(elem) {} @@ -326,7 +326,7 @@ struct TORCH_API NumberType : public Type { } static const TypeKind Kind = TypeKind::NumberType; // global singleton - static TypePtr get(); + static NumberTypePtr get(); private: NumberType() : Type(TypeKind::NumberType) {} @@ -351,7 +351,7 @@ struct TORCH_API FloatType : public Type { } static const TypeKind Kind = TypeKind::FloatType; // global singleton - static TypePtr get(); + static FloatTypePtr get(); private: FloatType() : Type(TypeKind::FloatType) {} @@ -376,7 +376,7 @@ struct TORCH_API IntType : public Type { } static const TypeKind Kind = TypeKind::IntType; // global singleton - static TypePtr get(); + static IntTypePtr get(); private: IntType() : Type(TypeKind::IntType) {} @@ -401,7 +401,7 @@ struct NoneType : public Type { } static const TypeKind Kind = TypeKind::NoneType; // global singleton - static TypePtr get(); + static NoneTypePtr get(); private: NoneType() : Type(TypeKind::NoneType) {} diff --git a/torch/csrc/jit/variable_tensor_list.h b/torch/csrc/jit/variable_tensor_list.h index eeae2a66b17e5f..0916fe6ac051d2 100644 --- a/torch/csrc/jit/variable_tensor_list.h +++ b/torch/csrc/jit/variable_tensor_list.h @@ -6,10 +6,10 @@ namespace torch { namespace jit { // a wrapper to mark places where we expect all the at::Tensors to be // variables struct variable_tensor_list : public std::vector { - variable_tensor_list() {} + variable_tensor_list() = default; template variable_tensor_list(InputIt first, InputIt last) - : std::vector(first, last) {} + : std::vector(first, last) {} explicit variable_tensor_list(std::vector && tensor) : std::vector(std::move(tensor)) {} }; diff --git a/torch/csrc/onnx/init.cpp b/torch/csrc/onnx/init.cpp index b09824ec77b4a5..64747c8c4b83a9 100644 --- a/torch/csrc/onnx/init.cpp +++ b/torch/csrc/onnx/init.cpp @@ -1,36 +1,33 @@ #include "torch/csrc/onnx/init.h" -#include "torch/csrc/onnx/onnx.npb.h" #include "torch/csrc/onnx/onnx.h" +#include "onnx/onnx.pb.h" namespace torch { namespace onnx { void initONNXBindings(PyObject* module) { auto m = py::handle(module).cast(); auto onnx = m.def_submodule("_onnx"); - py::enum_(onnx, "TensorProtoDataType") - .value("UNDEFINED", onnx_TensorProto_DataType_UNDEFINED) - .value("FLOAT", onnx_TensorProto_DataType_FLOAT) - .value("UINT8", onnx_TensorProto_DataType_UINT8) - .value("INT8", onnx_TensorProto_DataType_INT8) - .value("UINT16", onnx_TensorProto_DataType_UINT16) - .value("INT16", onnx_TensorProto_DataType_INT16) - .value("INT32", onnx_TensorProto_DataType_INT32) - .value("INT64", onnx_TensorProto_DataType_INT64) - .value("STRING", 
onnx_TensorProto_DataType_STRING) - .value("BOOL", onnx_TensorProto_DataType_BOOL) - .value("FLOAT16", onnx_TensorProto_DataType_FLOAT16) - .value("DOUBLE", onnx_TensorProto_DataType_DOUBLE) - .value("UINT32", onnx_TensorProto_DataType_UINT32) - .value("UINT64", onnx_TensorProto_DataType_UINT64) - .value("COMPLEX64", onnx_TensorProto_DataType_COMPLEX64) - .value("COMPLEX128", onnx_TensorProto_DataType_COMPLEX128); + py::enum_<::ONNX_NAMESPACE::TensorProto_DataType>(onnx, "TensorProtoDataType") + .value("UNDEFINED", ::ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) + .value("FLOAT", ::ONNX_NAMESPACE::TensorProto_DataType_FLOAT) + .value("UINT8", ::ONNX_NAMESPACE::TensorProto_DataType_UINT8) + .value("INT8", ::ONNX_NAMESPACE::TensorProto_DataType_INT8) + .value("UINT16", ::ONNX_NAMESPACE::TensorProto_DataType_UINT16) + .value("INT16", ::ONNX_NAMESPACE::TensorProto_DataType_INT16) + .value("INT32", ::ONNX_NAMESPACE::TensorProto_DataType_INT32) + .value("INT64", ::ONNX_NAMESPACE::TensorProto_DataType_INT64) + .value("STRING", ::ONNX_NAMESPACE::TensorProto_DataType_STRING) + .value("BOOL", ::ONNX_NAMESPACE::TensorProto_DataType_BOOL) + .value("FLOAT16", ::ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) + .value("DOUBLE", ::ONNX_NAMESPACE::TensorProto_DataType_DOUBLE) + .value("UINT32", ::ONNX_NAMESPACE::TensorProto_DataType_UINT32) + .value("UINT64", ::ONNX_NAMESPACE::TensorProto_DataType_UINT64) + .value("COMPLEX64", ::ONNX_NAMESPACE::TensorProto_DataType_COMPLEX64) + .value("COMPLEX128", ::ONNX_NAMESPACE::TensorProto_DataType_COMPLEX128); py::enum_(onnx, "OperatorExportTypes") .value("ONNX", OperatorExportTypes::ONNX) .value("ONNX_ATEN", OperatorExportTypes::ONNX_ATEN) .value("ONNX_ATEN_FALLBACK", OperatorExportTypes::ONNX_ATEN_FALLBACK) .value("RAW", OperatorExportTypes::RAW); - - py::class_(onnx, "ModelProto") - .def("prettyPrint", &ModelProto::prettyPrint); } }} // namespace torch::onnx diff --git a/torch/csrc/onnx/onnx.cpp b/torch/csrc/onnx/onnx.cpp deleted file mode 100644 index fa93f6866d5ed6..00000000000000 --- a/torch/csrc/onnx/onnx.cpp +++ /dev/null @@ -1,214 +0,0 @@ -#include "torch/csrc/onnx/onnx.h" - -namespace torch { namespace onnx { - -template <> -bool micropb_encode(pb_ostream_t *stream, std::string* arg) { - return pb_encode_string(stream, reinterpret_cast(arg->c_str()), arg->size()); -} -// NB: Overloads don't work so great for signed variables. Hope this doesn't -// come up! -template <> -bool micropb_encode(pb_ostream_t *stream, int64_t* arg) { - // Yes, this looks dodgy, and yes, this is what the docs say to do: - // https://jpa.kapsi.fi/nanopb/docs/reference.html#pb-encode-varint - return pb_encode_varint(stream, *reinterpret_cast(arg)); -} -template <> -bool micropb_encode(pb_ostream_t *stream, float* arg) { - return pb_encode_fixed32(stream, static_cast(arg)); -} -template <> -bool micropb_encode(pb_ostream_t *stream, double* arg) { - return pb_encode_fixed64(stream, static_cast(arg)); -} - -template <> -bool micropb_encode(pb_ostream_t *stream, Dimension* arg) { - return pb_encode_submessage(stream, onnx_TensorShapeProto_Dimension_fields, - static_cast(arg)); -} - -// TODO: I'm not entirely sure why this can't be in the header... -bool micropb_callback_string_from_tensor(pb_ostream_t *stream, const pb_field_t *field, void * const *arg) { - at::Tensor* t = static_cast(*arg); - AT_ASSERT(t->is_contiguous()); - // Packed array format! 
- pb_encode_tag_for_field(stream, field); - pb_encode_string(stream, (pb_byte_t*)(t->data_ptr()), t->type().elementSizeInBytes()*t->numel()); - - return true; -} - -GraphProto* AttributeProto::add_graphs() { - auto ptr = new GraphProto(); - graphs.emplace_back(ptr); - return ptr; -} - -constexpr char indent_char = ' '; -constexpr size_t indent_multiplier = 2; - -std::string idt(size_t indent) { - return std::string(indent * indent_multiplier, indent_char); -} - -std::string nlidt(size_t indent) { - return std::string("\n") + idt(indent); -} - -void TensorProto::dump(std::ostream& stream, size_t indent) { - stream << "TensorProto shape: ["; - for (size_t i = 0; i < dims.size(); ++i) { - stream << *dims[i] << (i == dims.size() - 1 ? "" : " "); - } - stream << "]"; -} - -void TensorShapeProto::dump(std::ostream& stream, size_t indent) { - for (size_t i=0; i < dims.size(); ++i) { - auto &dim = dims[i]; - if (dim->has_dim_value) { - stream << dim->dim_value; - } else { - stream << "?"; - } - stream << (i == dims.size() - 1 ? "" : " "); - } -} - -void TypeProtoTensor::dump(std::ostream& stream, size_t indent) { - stream << "Tensor dims: "; - shape->dump(stream); -} - -void TypeProto::dump(std::ostream& stream, size_t indent) { - tensor_type->dump(stream); -} - -void ValueInfoProto::dump(std::ostream& stream, size_t indent) { - stream << "{name: \"" << name - << "\", type:"; - type->dump(stream); - stream << "}"; -} - -void AttributeProto::dump(std::ostream& stream, size_t indent) { - stream << "{ name: '" << name << "', type: "; - if (proto.has_f) { - stream << "float, value: " << proto.f; - } else if (proto.has_i) { - stream << "int, value: " << proto.i; - } else if (s.length()) { - stream << "string, value: '" << s << "'"; - } else if (g) { - stream << "graph, value:\n"; - g->dump(stream, indent+1); - stream << nlidt(indent); - } else if (t) { - stream << "tensor, value:"; - t->dump(stream, indent+1); - } else if (floats.size()) { - stream << "floats, values: ["; - for (size_t i=0; i < floats.size(); ++i) - stream << *floats[i] << (i == floats.size() - 1 ? "" : " "); - stream << "]"; - } else if (ints.size()) { - stream << "ints, values: ["; - for (size_t i=0; i < ints.size(); ++i) - stream << *ints[i] << (i == ints.size() - 1 ? "" : " "); - stream << "]"; - } else if (strings.size()) { - stream << "strings, values: ["; - for (size_t i=0; i < strings.size(); ++i) - stream << "'" << *strings[i] << "'" << (i == strings.size() - 1 ? "" : " "); - stream << "]"; - } else if (tensors.size()) { - stream << "tensors, values: ["; - for (auto& t : tensors) { - t->dump(stream, indent+1); - } - stream << "]"; - } else if (graphs.size()) { - stream << "graphs, values: ["; - for (auto& g : graphs) { - g->dump(stream, indent+1); - } - stream << "]"; - } else { - stream << "UNKNOWN"; - } - stream << "}"; -} - -void NodeProto::dump(std::ostream& stream, size_t indent) { - stream << "Node {type: \"" << op_type << "\", inputs: ["; - for (size_t i=0; i < inputs.size(); ++i) { - stream << *inputs[i] << (i == inputs.size() - 1 ? "" : ","); - } - stream << "], outputs: ["; - for (size_t i=0; i < outputs.size(); ++i) { - stream << *outputs[i] << (i == outputs.size() - 1 ? "" : ","); - } - stream << "], attributes: ["; - for (size_t i=0; i < attributes.size(); ++i) { - attributes[i]->dump(stream, indent+1); - stream << (i == attributes.size() - 1 ? 
"" : ","); - } - stream << "]}"; -} - -void GraphProto::dump(std::ostream& stream, size_t indent) { - stream << idt(indent) << "GraphProto {" << nlidt(indent+1) - << "name: \"" << name << "\"" << nlidt(indent+1) - << "inputs: ["; - for (size_t i=0; i < inputs.size(); ++i) { - inputs[i]->dump(stream, indent+2); - stream << (i == inputs.size() - 1 ? "" : ","); - } - stream << "]" << nlidt(indent+1) - << "outputs: ["; - for (size_t i=0; i < outputs.size(); ++i) { - outputs[i]->dump(stream, indent+2); - stream << (i == outputs.size() - 1 ? "" : ","); - } - stream << "]" << nlidt(indent+1) - << "initializers: ["; - for (size_t i=0; i < initializers.size(); ++i) { - initializers[i]->dump(stream, indent+2); - stream << (i == initializers.size() - 1 ? "" : ","); - } - stream << "]" << nlidt(indent+1) - << "nodes: [" << nlidt(indent+2); - for (size_t i=0; i < nodes.size(); ++i) { - nodes[i]->dump(stream, indent+2); - if (i != nodes.size() - 1) stream << "," << nlidt(indent+2); - } - stream << nlidt(indent+1) << "]\n" << idt(indent) << "}\n"; -} - -void OperatorSetIdProto::dump(std::ostream& stream, size_t indent) { - stream << "OperatorSetIdProto { domain: " << domain << "}"; -} - -void ModelProto::dump(std::ostream& stream, size_t indent) { - stream << idt(indent) - << "ModelProto {" << nlidt(indent+1) - << "producer_name: \"" << producer_name << "\"" << nlidt(indent+1) - << "domain: \"" << domain << "\"" << nlidt(indent+1) - << "doc_string: \"" << doc_string << "\""; - if (graph) { - stream << nlidt(indent+1) << "graph:\n"; - graph->dump(stream, indent+2); - } - if (opset_import.size()) { - stream << idt(indent+1) << "opset_import: ["; - for (auto &opset_imp : opset_import) { - opset_imp->dump(stream, indent+2); - } - stream << "],\n"; - } - stream << idt(indent) << "}\n"; -} - -}} // namespace onnx diff --git a/torch/csrc/onnx/onnx.h b/torch/csrc/onnx/onnx.h index 7fa38cc03898e9..76170e18110f1b 100644 --- a/torch/csrc/onnx/onnx.h +++ b/torch/csrc/onnx/onnx.h @@ -1,435 +1,11 @@ #pragma once -#include "torch/csrc/onnx/onnx.npb.h" -#include "torch/csrc/WindowsTorchApiMacro.h" - -#include -#include - -#include -#include -#include - namespace torch { namespace onnx { -using DataType = onnx_TensorProto_DataType; -using Dimension = onnx_TensorShapeProto_Dimension; - -// Note [Unique vector] -// ~~~~~~~~~~~~~~~~~~~~ -// Why do we need vectors of unique pointers? A Google-style C++ Protobuf API -// returns raw pointers T* which are expected to stay valid as long as the -// enclosing protobuf is live. However, if we store T directly in a vector, if -// the vector ever resizes (which it may, because we don't know a priori how -// many elements are in the vector) all of these pointers will be invalidated. -// Thus, up-front, we have to give them permanent, dynamically allocated -// addresses. 
-template -using unique_vector = std::vector>; - -// Helper function for encoding inside callbacks -template -bool micropb_encode(pb_ostream_t *stream, T* arg) { - static_assert(Field != nullptr, "no overload in micropb_encode"); - return pb_encode_submessage(stream, Field, static_cast(&arg->proto)); -} -template <> bool micropb_encode(pb_ostream_t *stream, std::string* arg); -template <> bool micropb_encode(pb_ostream_t *stream, int64_t* arg); -template <> bool micropb_encode(pb_ostream_t *stream, float* arg); -template <> bool micropb_encode(pb_ostream_t *stream, double* arg); -template <> bool micropb_encode(pb_ostream_t *stream, Dimension* arg); -// NB: If we ever add support for signed protobuf integers, we'll need a special -// wrapper, since we can't overload over them (they look the same from C++ side) - -// Callback functions of type pb_callback_t. - -// Write out a single protobuf field inside a message -template -bool micropb_callback(pb_ostream_t *stream, const pb_field_t *field, void * const *arg) { - if (!pb_encode_tag_for_field(stream, field)) return false; - if (!micropb_encode(stream, static_cast(*arg))) return false; - return true; -} - -// Write out a repeated protobuf field inside a message -template -bool micropb_callback_list(pb_ostream_t *stream, const pb_field_t *field, void * const *arg) { - std::vector>* vals = static_cast>*>(*arg); - for (std::unique_ptr& val : *vals) { - auto ptr = static_cast(val.get()); - if (!micropb_callback(stream, field, &ptr)) return false; - } - return true; -} - -bool micropb_callback_string_from_tensor(pb_ostream_t *stream, const pb_field_t *field, void * const *arg); - -// MicroProto helper class -template -struct MicroProto { - // The actual nanopb generated protobuf struct we are filling. - T proto; - - // The constructor takes the protobuf struct by value for initialization - // (since it is a C-style struct). In the constructor you're - // expected to call this with something like onnx_TensorProto_init_default - MicroProto(T proto) : proto(proto) {} - - // Usage: - // std::string owning_slot; - // proto.string_field = string(&owning_slot, value_to_set) - // - // This function takes a string 's' and copies it into the - // owning slot specified by 'slot'. It then returns a callback - // intended to be assigned into the particular protobuf field. - // The employed callback reads out the string from owning - // slot and writes it out to the protobuf. - // - // You should call this function IN THE SETTER METHOD, because - // the no-op callback is different from a callback with an empty - // string: in the former case, the field is absent; in the latter, - // the field is present but an empty string. - pb_callback_t string(std::string* slot, const std::string& s) { - *slot = s; // copy construct - pb_callback_t r; - r.funcs.encode = µpb_callback; - r.arg = static_cast(slot); - return r; // RVO - } - - // Usage: - // at::Tensor owning_slot; - // proto.string_field = string_from_tensor(&owning_slot, value_to_set) - // - // This function takes an at::Tensor and copies it into the - // owning slot specified by 'slot'. It then returns a callback - // intended to be assigned into the particular protobuf field. - // The employed callback reads out the tensor's data as if it - // were a string (adjusting for endianness, if necessary) - // writes it out to the protobuf. - // - // You should call this function IN THE SETTER METHOD, because - // the no-op callback is different from a callback with an undefined - // Tensor. 
- pb_callback_t string_from_tensor(at::Tensor* slot, const at::Tensor& t) { - *slot = t; // copy construct - pb_callback_t r; - r.funcs.encode = µpb_callback_string_from_tensor; - r.arg = static_cast(slot); - return r; // RVO - } - - // Usage: - // unique_vector owning_slot; - // proto.list_field = list(&owning_slot) - // - // This function returns a callback intended to be - // assigned into a particular protobuf field. The employed - // callback reads out the vector of elements from the owning - // slot and writes the entries into the protobuf. - // - // You should call this function IN THE CONSTRUCTOR, because - // the no-op callback is equivalent to a callback with an empty - // list. (While it's harmless to call this in the setter, but - // a bit wasteful.) - template - pb_callback_t list(unique_vector* slot) { - pb_callback_t r; - r.funcs.encode = µpb_callback_list; - r.arg = static_cast(slot); - return r; // RVO - } - - template - pb_callback_t msg(std::unique_ptr* slot) { - *slot = std::unique_ptr(new S()); // default construct - pb_callback_t r; - r.funcs.encode = µpb_callback; - r.arg = static_cast(slot->get()); - return r; // RVO - } -}; - -#define DEFINE_CONST(C) \ -const auto k##C = onnx_TensorProto_DataType_##C; -DEFINE_CONST(FLOAT) -DEFINE_CONST(UINT8) -DEFINE_CONST(INT8) -DEFINE_CONST(UINT16) -DEFINE_CONST(INT16) -DEFINE_CONST(INT32) -DEFINE_CONST(INT64) -DEFINE_CONST(STRING) -DEFINE_CONST(BOOL) -DEFINE_CONST(FLOAT16) -DEFINE_CONST(DOUBLE) -DEFINE_CONST(UINT32) -DEFINE_CONST(UINT64) -DEFINE_CONST(COMPLEX64) -DEFINE_CONST(COMPLEX128) -#undef DEFINE_CONST - -#define DEFINE_CONST(C) \ -const auto a##C = onnx_AttributeProto_AttributeType_##C; -DEFINE_CONST(FLOAT) -DEFINE_CONST(INT) -DEFINE_CONST(STRING) -DEFINE_CONST(TENSOR) -DEFINE_CONST(GRAPH) -DEFINE_CONST(FLOATS) -DEFINE_CONST(INTS) -DEFINE_CONST(STRINGS) -DEFINE_CONST(TENSORS) -DEFINE_CONST(GRAPHS) -#undef DEFINE_CONST - -// C++ wrappers which simulate the Google C++ Protobuf API -// -// These are NOT COMPLETE wrappers. If you find something is missing, add it! - -class AttributeProto; -class TensorShapeProto; -class TypeProtoTensor; -class TensorProto; -class TypeProto; -class ValueInfoProto; -class NodeProto; -class GraphProto; -class ModelProto; - -class TensorProto : public MicroProto { -private: - std::string name; // namespace ValueInfoProto. - unique_vector dims; - at::Tensor raw_data; - std::string dump_; -public: - TensorProto() : MicroProto(onnx_TensorProto_init_default) { - proto.dims = list(&dims); - } - void set_name(const std::string& s) { proto.name = string(&name, s); } - void add_dims(int64_t d) { dims.emplace_back(new int64_t(d)); } - // Google Protobuf divergence! 
- void set_raw_data(const at::Tensor& t) { proto.raw_data = string_from_tensor(&raw_data, t); } - void set_external_data_present() { proto.raw_data = string(&dump_, "__EXTERNAL"); } - void set_data_type(onnx_TensorProto_DataType t) { proto.has_data_type = true; proto.data_type = t; } - std::string get_name() const { return name; } - void dump(std::ostream& stream, size_t indent = 0); -}; - -class TensorShapeProto : public MicroProto { -private: - unique_vector dims; -public: - TensorShapeProto() : MicroProto(onnx_TensorShapeProto_init_default) { - proto.dim = list(&dims); - } - void add_dim(std::int64_t d) { - Dimension* p_d = new Dimension(); - p_d->has_dim_value = true; - p_d->dim_value = d; - dims.emplace_back(p_d); - } - void dump(std::ostream& stream, size_t indent = 0); -}; - -class TypeProtoTensor : public MicroProto { -private: - std::unique_ptr shape; -public: - TypeProtoTensor() : MicroProto(onnx_TypeProto_Tensor_init_default) {} - void set_data_type(onnx_TensorProto_DataType t) { proto.has_elem_type = true; proto.elem_type = t; } - TensorShapeProto* mutable_shape() { - proto.shape = msg(&shape); - return shape.get(); - } - void dump(std::ostream& stream, size_t indent = 0); -}; - -class TypeProto : public MicroProto { -private: - std::unique_ptr tensor_type; -public: - TypeProto() : MicroProto(onnx_TypeProto_init_default) {} - TypeProtoTensor* mutable_tensor_type() { - proto.tensor_type = msg(&tensor_type); - return tensor_type.get(); - } - void dump(std::ostream& stream, size_t indent = 0); -}; - -class ValueInfoProto : public MicroProto { -private: - std::string name; - std::unique_ptr type; -public: - ValueInfoProto() : MicroProto(onnx_ValueInfoProto_init_default) {} - std::string get_name() { return name; } - void set_name(const std::string& s) { proto.name = string(&name, s); } - TypeProto* mutable_type() { - proto.type = msg(&type); - return type.get(); - } - void dump(std::ostream& stream, size_t indent = 0); -}; - -class AttributeProto : public MicroProto { -private: - std::string name; - std::string s; - std::unique_ptr g; - std::unique_ptr t; - unique_vector floats; - unique_vector ints; - unique_vector strings; - unique_vector tensors; - unique_vector graphs; -public: - AttributeProto() : MicroProto(onnx_AttributeProto_init_default) { - proto.floats = list(&floats); - proto.ints = list(&ints); - proto.strings = list(&strings); - proto.tensors = list(&tensors); - proto.graphs = list(&graphs); - } - void set_name(const std::string& s) { proto.name = string(&name, s); } - void set_type(onnx_AttributeProto_AttributeType t) { proto.has_type = true; proto.type = t; } - void set_f(float f) { proto.has_f = true; proto.f = f; } - void set_i(int64_t i) { proto.has_i = true; proto.i = i; } - void set_s(std::string s_) { proto.s = string(&s, s_); } - // See https://developers.google.com/protocol-buffers/docs/reference/cpp-generated#embeddedmessage - GraphProto* mutable_g() { proto.g = msg(&g); return g.get(); } - TensorProto* mutable_t() { proto.t = msg(&t); return t.get(); } - void add_floats(float f) { floats.emplace_back(new float(f)); } - void add_ints(int64_t i) { ints.emplace_back(new int64_t(i)); } - void add_strings(std::string s) { strings.emplace_back(new std::string(s)); } - TensorProto* add_tensors() { - auto ptr = new TensorProto(); - tensors.emplace_back(ptr); - return ptr; - } - GraphProto* add_graphs(); - void dump(std::ostream& stream, size_t indent = 0); -}; - -class NodeProto : public MicroProto { -private: - std::string op_type; - std::string domain; - 
std::string doc_string; - unique_vector inputs; - unique_vector outputs; - unique_vector attributes; -public: - NodeProto() : MicroProto(onnx_NodeProto_init_default) { - proto.input = list(&inputs); - proto.output = list(&outputs); - proto.attribute = list(&attributes); - } - void add_input(const std::string& s) { inputs.emplace_back(new std::string(s)); } - void clear_input() { inputs.clear(); } - void add_output(const std::string& s) { outputs.emplace_back(new std::string(s)); } - void clear_output() { outputs.clear(); } - AttributeProto* add_attribute() { - auto ptr = new AttributeProto(); - attributes.emplace_back(ptr); - return ptr; - } - void set_op_type(const std::string& s) { proto.op_type = string(&op_type, s); } - void set_domain(const std::string& s) { proto.domain = string(&domain, s); } - void set_doc_string(const std::string& s) { proto.doc_string = string(&doc_string, s); } - void dump(std::ostream& stream, size_t indent = 0); -}; - -class GraphProto : public MicroProto { -private: - std::string name; - unique_vector inputs; - unique_vector outputs; - unique_vector nodes; - unique_vector initializers; -public: - GraphProto() : MicroProto(onnx_GraphProto_init_default) { - proto.input = list(&inputs); - proto.output = list(&outputs); - proto.node = list(&nodes); - proto.initializer = list(&initializers); - } - void set_name(const std::string& s) { proto.name = string(&name, s); } - ValueInfoProto* add_input() { - auto ptr = new ValueInfoProto(); - inputs.emplace_back(ptr); - return ptr; - } - std::string get_input_name(size_t i) { return inputs.at(i)->get_name(); } - ValueInfoProto* add_output() { - auto ptr = new ValueInfoProto(); - outputs.emplace_back(ptr); - return ptr; - } - NodeProto* add_node() { - auto ptr = new NodeProto(); - nodes.emplace_back(ptr); - return ptr; - } - TensorProto* add_initializer() { - auto ptr = new TensorProto(); - initializers.emplace_back(ptr); - return ptr; - } - void dump(std::ostream& stream, size_t indent = 0); -}; - -class OperatorSetIdProto : public MicroProto { -private: - std::string domain; -public: - OperatorSetIdProto() : MicroProto(onnx_OperatorSetIdProto_init_default) {} - void set_domain(const std::string& s) { proto.domain = string(&domain, s); } - void set_version(int64_t v) { proto.has_version = true; proto.version = v; } - void dump(std::ostream& stream, size_t indent = 0); -}; - -class ModelProto : public MicroProto { -private: - std::string producer_name; - std::string producer_version; - std::string domain; - std::string doc_string; - std::unique_ptr graph; - unique_vector opset_import; -public: - ModelProto() : MicroProto(onnx_ModelProto_init_default) { - proto.has_ir_version = true; - proto.ir_version = onnx_Version_IR_VERSION; - proto.opset_import = list(&opset_import); - } - void set_model_version(int64_t i) { proto.has_model_version = true; proto.model_version = i; } - void set_doc_string(const std::string& s) { proto.doc_string = string(&doc_string, s); } - void set_producer_name(const std::string& s) { proto.producer_name = string(&producer_name, s); } - void set_producer_version(const std::string& s) { proto.producer_version = string(&producer_version, s); } - GraphProto* mutable_graph() { - proto.graph = msg(&graph); - return graph.get(); - } - OperatorSetIdProto* add_opset_import() { - auto ptr = new OperatorSetIdProto(); - opset_import.emplace_back(ptr); - return ptr; - } - TORCH_API void dump(std::ostream& stream, size_t indent = 0); - std::string prettyPrint() { - std::stringstream ss; - dump(ss, 0); - return 
ss.str(); - } -}; - enum class OperatorExportTypes { ONNX, // Strict ONNX export ONNX_ATEN, // ONNX With ATen op everywhere ONNX_ATEN_FALLBACK, // ONNX export with ATen fallback RAW, // Raw export (no ONNX) }; - }} // namespace torch::onnx diff --git a/torch/csrc/onnx/onnx.npb.cpp b/torch/csrc/onnx/onnx.npb.cpp deleted file mode 100644 index 2d8ee60eaff414..00000000000000 --- a/torch/csrc/onnx/onnx.npb.cpp +++ /dev/null @@ -1,162 +0,0 @@ -/* Automatically generated nanopb constant definitions */ -/* Generated by nanopb-0.3.9-dev */ - -#include "onnx.npb.h" - -/* @@protoc_insertion_point(includes) */ -#if PB_PROTO_HEADER_VERSION != 30 -#error Regenerate this file with the current version of nanopb generator. -#endif - - - -const pb_field_t onnx_AttributeProto_fields[14] = { - PB_FIELD( 1, STRING , OPTIONAL, CALLBACK, FIRST, onnx_AttributeProto, name, name, 0), - PB_FIELD( 2, FLOAT , OPTIONAL, STATIC , OTHER, onnx_AttributeProto, f, name, 0), - PB_FIELD( 3, INT64 , OPTIONAL, STATIC , OTHER, onnx_AttributeProto, i, f, 0), - PB_FIELD( 4, BYTES , OPTIONAL, CALLBACK, OTHER, onnx_AttributeProto, s, i, 0), - PB_FIELD( 5, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_AttributeProto, t, s, &onnx_TensorProto_fields), - PB_FIELD( 6, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_AttributeProto, g, t, &onnx_GraphProto_fields), - PB_FIELD( 7, FLOAT , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, floats, g, 0), - PB_FIELD( 8, INT64 , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, ints, floats, 0), - PB_FIELD( 9, BYTES , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, strings, ints, 0), - PB_FIELD( 10, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, tensors, strings, &onnx_TensorProto_fields), - PB_FIELD( 11, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, graphs, tensors, &onnx_GraphProto_fields), - PB_FIELD( 13, STRING , OPTIONAL, CALLBACK, OTHER, onnx_AttributeProto, doc_string, graphs, 0), - PB_FIELD( 20, UENUM , OPTIONAL, STATIC , OTHER, onnx_AttributeProto, type, doc_string, 0), - PB_LAST_FIELD -}; - -const pb_field_t onnx_ValueInfoProto_fields[4] = { - PB_FIELD( 1, STRING , OPTIONAL, CALLBACK, FIRST, onnx_ValueInfoProto, name, name, 0), - PB_FIELD( 2, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_ValueInfoProto, type, name, &onnx_TypeProto_fields), - PB_FIELD( 3, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ValueInfoProto, doc_string, type, 0), - PB_LAST_FIELD -}; - -const pb_field_t onnx_NodeProto_fields[8] = { - PB_FIELD( 1, STRING , REPEATED, CALLBACK, FIRST, onnx_NodeProto, input, input, 0), - PB_FIELD( 2, STRING , REPEATED, CALLBACK, OTHER, onnx_NodeProto, output, input, 0), - PB_FIELD( 3, STRING , OPTIONAL, CALLBACK, OTHER, onnx_NodeProto, name, output, 0), - PB_FIELD( 4, STRING , OPTIONAL, CALLBACK, OTHER, onnx_NodeProto, op_type, name, 0), - PB_FIELD( 5, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_NodeProto, attribute, op_type, &onnx_AttributeProto_fields), - PB_FIELD( 6, STRING , OPTIONAL, CALLBACK, OTHER, onnx_NodeProto, doc_string, attribute, 0), - PB_FIELD( 7, STRING , OPTIONAL, CALLBACK, OTHER, onnx_NodeProto, domain, doc_string, 0), - PB_LAST_FIELD -}; - -const pb_field_t onnx_ModelProto_fields[10] = { - PB_FIELD( 1, INT64 , OPTIONAL, STATIC , FIRST, onnx_ModelProto, ir_version, ir_version, 0), - PB_FIELD( 2, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, producer_name, ir_version, 0), - PB_FIELD( 3, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, producer_version, producer_name, 0), - PB_FIELD( 4, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, domain, 
producer_version, 0), - PB_FIELD( 5, INT64 , OPTIONAL, STATIC , OTHER, onnx_ModelProto, model_version, domain, 0), - PB_FIELD( 6, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, doc_string, model_version, 0), - PB_FIELD( 7, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, graph, doc_string, &onnx_GraphProto_fields), - PB_FIELD( 8, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_ModelProto, opset_import, graph, &onnx_OperatorSetIdProto_fields), - PB_FIELD( 14, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_ModelProto, metadata_props, opset_import, &onnx_StringStringEntryProto_fields), - PB_LAST_FIELD -}; - -const pb_field_t onnx_StringStringEntryProto_fields[3] = { - PB_FIELD( 1, STRING , OPTIONAL, CALLBACK, FIRST, onnx_StringStringEntryProto, key, key, 0), - PB_FIELD( 2, STRING , OPTIONAL, CALLBACK, OTHER, onnx_StringStringEntryProto, value, key, 0), - PB_LAST_FIELD -}; - -const pb_field_t onnx_GraphProto_fields[8] = { - PB_FIELD( 1, MESSAGE , REPEATED, CALLBACK, FIRST, onnx_GraphProto, node, node, &onnx_NodeProto_fields), - PB_FIELD( 2, STRING , OPTIONAL, CALLBACK, OTHER, onnx_GraphProto, name, node, 0), - PB_FIELD( 5, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_GraphProto, initializer, name, &onnx_TensorProto_fields), - PB_FIELD( 10, STRING , OPTIONAL, CALLBACK, OTHER, onnx_GraphProto, doc_string, initializer, 0), - PB_FIELD( 11, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_GraphProto, input, doc_string, &onnx_ValueInfoProto_fields), - PB_FIELD( 12, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_GraphProto, output, input, &onnx_ValueInfoProto_fields), - PB_FIELD( 13, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_GraphProto, value_info, output, &onnx_ValueInfoProto_fields), - PB_LAST_FIELD -}; - -const pb_field_t onnx_TensorProto_fields[13] = { - PB_FIELD( 1, INT64 , REPEATED, CALLBACK, FIRST, onnx_TensorProto, dims, dims, 0), - PB_FIELD( 2, UENUM , OPTIONAL, STATIC , OTHER, onnx_TensorProto, data_type, dims, 0), - PB_FIELD( 3, MESSAGE , OPTIONAL, STATIC , OTHER, onnx_TensorProto, segment, data_type, &onnx_TensorProto_Segment_fields), - PB_FIELD( 4, FLOAT , REPEATED, CALLBACK, OTHER, onnx_TensorProto, float_data, segment, 0), - PB_FIELD( 5, INT32 , REPEATED, CALLBACK, OTHER, onnx_TensorProto, int32_data, float_data, 0), - PB_FIELD( 6, BYTES , REPEATED, CALLBACK, OTHER, onnx_TensorProto, string_data, int32_data, 0), - PB_FIELD( 7, INT64 , REPEATED, CALLBACK, OTHER, onnx_TensorProto, int64_data, string_data, 0), - PB_FIELD( 8, STRING , OPTIONAL, CALLBACK, OTHER, onnx_TensorProto, name, int64_data, 0), - PB_FIELD( 9, BYTES , OPTIONAL, CALLBACK, OTHER, onnx_TensorProto, raw_data, name, 0), - PB_FIELD( 10, DOUBLE , REPEATED, CALLBACK, OTHER, onnx_TensorProto, double_data, raw_data, 0), - PB_FIELD( 11, UINT64 , REPEATED, CALLBACK, OTHER, onnx_TensorProto, uint64_data, double_data, 0), - PB_FIELD( 12, STRING , OPTIONAL, CALLBACK, OTHER, onnx_TensorProto, doc_string, uint64_data, 0), - PB_LAST_FIELD -}; - -const pb_field_t onnx_TensorProto_Segment_fields[3] = { - PB_FIELD( 1, INT64 , OPTIONAL, STATIC , FIRST, onnx_TensorProto_Segment, begin, begin, 0), - PB_FIELD( 2, INT64 , OPTIONAL, STATIC , OTHER, onnx_TensorProto_Segment, end, begin, 0), - PB_LAST_FIELD -}; - -const pb_field_t onnx_TensorShapeProto_fields[2] = { - PB_FIELD( 1, MESSAGE , REPEATED, CALLBACK, FIRST, onnx_TensorShapeProto, dim, dim, &onnx_TensorShapeProto_Dimension_fields), - PB_LAST_FIELD -}; - -const pb_field_t onnx_TensorShapeProto_Dimension_fields[3] = { - PB_FIELD( 1, INT64 , OPTIONAL, STATIC , FIRST, onnx_TensorShapeProto_Dimension, 
dim_value, dim_value, 0), - PB_FIELD( 2, STRING , OPTIONAL, CALLBACK, OTHER, onnx_TensorShapeProto_Dimension, dim_param, dim_value, 0), - PB_LAST_FIELD -}; - -const pb_field_t onnx_TypeProto_fields[2] = { - PB_FIELD( 1, MESSAGE , OPTIONAL, CALLBACK, FIRST, onnx_TypeProto, tensor_type, tensor_type, &onnx_TypeProto_Tensor_fields), - PB_LAST_FIELD -}; - -const pb_field_t onnx_TypeProto_Tensor_fields[3] = { - PB_FIELD( 1, UENUM , OPTIONAL, STATIC , FIRST, onnx_TypeProto_Tensor, elem_type, elem_type, 0), - PB_FIELD( 2, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_TypeProto_Tensor, shape, elem_type, &onnx_TensorShapeProto_fields), - PB_LAST_FIELD -}; - -const pb_field_t onnx_OperatorSetIdProto_fields[3] = { - PB_FIELD( 1, STRING , OPTIONAL, CALLBACK, FIRST, onnx_OperatorSetIdProto, domain, domain, 0), - PB_FIELD( 2, INT64 , OPTIONAL, STATIC , OTHER, onnx_OperatorSetIdProto, version, domain, 0), - PB_LAST_FIELD -}; - - - - - -/* Check that field information fits in pb_field_t */ -#if !defined(PB_FIELD_32BIT) -/* If you get an error here, it means that you need to define PB_FIELD_32BIT - * compile-time option. You can do that in pb.h or on compiler command line. - * - * The reason you need to do this is that some of your messages contain tag - * numbers or field sizes that are larger than what can fit in 8 or 16 bit - * field descriptors. - */ -PB_STATIC_ASSERT((pb_membersize(onnx_TensorProto, segment) < 65536), YOU_MUST_DEFINE_PB_FIELD_32BIT_FOR_MESSAGES_onnx_AttributeProto_onnx_ValueInfoProto_onnx_NodeProto_onnx_ModelProto_onnx_StringStringEntryProto_onnx_GraphProto_onnx_TensorProto_onnx_TensorProto_Segment_onnx_TensorShapeProto_onnx_TensorShapeProto_Dimension_onnx_TypeProto_onnx_TypeProto_Tensor_onnx_OperatorSetIdProto) -#endif - -#if !defined(PB_FIELD_16BIT) && !defined(PB_FIELD_32BIT) -/* If you get an error here, it means that you need to define PB_FIELD_16BIT - * compile-time option. You can do that in pb.h or on compiler command line. - * - * The reason you need to do this is that some of your messages contain tag - * numbers or field sizes that are larger than what can fit in the default - * 8 bit descriptors. - */ -PB_STATIC_ASSERT((pb_membersize(onnx_TensorProto, segment) < 256), YOU_MUST_DEFINE_PB_FIELD_16BIT_FOR_MESSAGES_onnx_AttributeProto_onnx_ValueInfoProto_onnx_NodeProto_onnx_ModelProto_onnx_StringStringEntryProto_onnx_GraphProto_onnx_TensorProto_onnx_TensorProto_Segment_onnx_TensorShapeProto_onnx_TensorShapeProto_Dimension_onnx_TypeProto_onnx_TypeProto_Tensor_onnx_OperatorSetIdProto) -#endif - - -/* On some platforms (such as AVR), double is really float. - * These are not directly supported by nanopb, but see example_avr_double. - * To get rid of this error, remove any double fields from your .proto. - */ -PB_STATIC_ASSERT(sizeof(double) == 8, DOUBLE_MUST_BE_8_BYTES) - -/* @@protoc_insertion_point(eof) */ diff --git a/torch/csrc/onnx/onnx.npb.h b/torch/csrc/onnx/onnx.npb.h deleted file mode 100644 index 84d3b318643830..00000000000000 --- a/torch/csrc/onnx/onnx.npb.h +++ /dev/null @@ -1,333 +0,0 @@ -/* Automatically generated nanopb header */ -/* Generated by nanopb-0.3.9-dev */ - -#ifndef PB_ONNX_ONNX_PB_H_INCLUDED -#define PB_ONNX_ONNX_PB_H_INCLUDED -#include - -/* @@protoc_insertion_point(includes) */ -#if PB_PROTO_HEADER_VERSION != 30 -#error Regenerate this file with the current version of nanopb generator. 
-#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/* Enum definitions */ -typedef enum _onnx_Version { - onnx_Version__START_VERSION = 0, - onnx_Version_IR_VERSION_2017_10_10 = 1, - onnx_Version_IR_VERSION_2017_10_30 = 2, - onnx_Version_IR_VERSION = 3 -} onnx_Version; -#define _onnx_Version_MIN onnx_Version__START_VERSION -#define _onnx_Version_MAX onnx_Version_IR_VERSION -#define _onnx_Version_ARRAYSIZE ((onnx_Version)(onnx_Version_IR_VERSION+1)) - -typedef enum _onnx_AttributeProto_AttributeType { - onnx_AttributeProto_AttributeType_UNDEFINED = 0, - onnx_AttributeProto_AttributeType_FLOAT = 1, - onnx_AttributeProto_AttributeType_INT = 2, - onnx_AttributeProto_AttributeType_STRING = 3, - onnx_AttributeProto_AttributeType_TENSOR = 4, - onnx_AttributeProto_AttributeType_GRAPH = 5, - onnx_AttributeProto_AttributeType_FLOATS = 6, - onnx_AttributeProto_AttributeType_INTS = 7, - onnx_AttributeProto_AttributeType_STRINGS = 8, - onnx_AttributeProto_AttributeType_TENSORS = 9, - onnx_AttributeProto_AttributeType_GRAPHS = 10 -} onnx_AttributeProto_AttributeType; -#define _onnx_AttributeProto_AttributeType_MIN onnx_AttributeProto_AttributeType_UNDEFINED -#define _onnx_AttributeProto_AttributeType_MAX onnx_AttributeProto_AttributeType_GRAPHS -#define _onnx_AttributeProto_AttributeType_ARRAYSIZE ((onnx_AttributeProto_AttributeType)(onnx_AttributeProto_AttributeType_GRAPHS+1)) - -typedef enum _onnx_TensorProto_DataType { - onnx_TensorProto_DataType_UNDEFINED = 0, - onnx_TensorProto_DataType_FLOAT = 1, - onnx_TensorProto_DataType_UINT8 = 2, - onnx_TensorProto_DataType_INT8 = 3, - onnx_TensorProto_DataType_UINT16 = 4, - onnx_TensorProto_DataType_INT16 = 5, - onnx_TensorProto_DataType_INT32 = 6, - onnx_TensorProto_DataType_INT64 = 7, - onnx_TensorProto_DataType_STRING = 8, - onnx_TensorProto_DataType_BOOL = 9, - onnx_TensorProto_DataType_FLOAT16 = 10, - onnx_TensorProto_DataType_DOUBLE = 11, - onnx_TensorProto_DataType_UINT32 = 12, - onnx_TensorProto_DataType_UINT64 = 13, - onnx_TensorProto_DataType_COMPLEX64 = 14, - onnx_TensorProto_DataType_COMPLEX128 = 15 -} onnx_TensorProto_DataType; -#define _onnx_TensorProto_DataType_MIN onnx_TensorProto_DataType_UNDEFINED -#define _onnx_TensorProto_DataType_MAX onnx_TensorProto_DataType_COMPLEX128 -#define _onnx_TensorProto_DataType_ARRAYSIZE ((onnx_TensorProto_DataType)(onnx_TensorProto_DataType_COMPLEX128+1)) - -/* Struct definitions */ -typedef struct _onnx_GraphProto { - pb_callback_t node; - pb_callback_t name; - pb_callback_t initializer; - pb_callback_t doc_string; - pb_callback_t input; - pb_callback_t output; - pb_callback_t value_info; -/* @@protoc_insertion_point(struct:onnx_GraphProto) */ -} onnx_GraphProto; - -typedef struct _onnx_NodeProto { - pb_callback_t input; - pb_callback_t output; - pb_callback_t name; - pb_callback_t op_type; - pb_callback_t attribute; - pb_callback_t doc_string; - pb_callback_t domain; -/* @@protoc_insertion_point(struct:onnx_NodeProto) */ -} onnx_NodeProto; - -typedef struct _onnx_StringStringEntryProto { - pb_callback_t key; - pb_callback_t value; -/* @@protoc_insertion_point(struct:onnx_StringStringEntryProto) */ -} onnx_StringStringEntryProto; - -typedef struct _onnx_TensorShapeProto { - pb_callback_t dim; -/* @@protoc_insertion_point(struct:onnx_TensorShapeProto) */ -} onnx_TensorShapeProto; - -typedef struct _onnx_TypeProto { - pb_callback_t tensor_type; -/* @@protoc_insertion_point(struct:onnx_TypeProto) */ -} onnx_TypeProto; - -typedef struct _onnx_ValueInfoProto { - pb_callback_t name; - pb_callback_t type; - 
pb_callback_t doc_string; -/* @@protoc_insertion_point(struct:onnx_ValueInfoProto) */ -} onnx_ValueInfoProto; - -typedef struct _onnx_AttributeProto { - pb_callback_t name; - bool has_f; - float f; - bool has_i; - int64_t i; - pb_callback_t s; - pb_callback_t t; - pb_callback_t g; - pb_callback_t floats; - pb_callback_t ints; - pb_callback_t strings; - pb_callback_t tensors; - pb_callback_t graphs; - pb_callback_t doc_string; - bool has_type; - onnx_AttributeProto_AttributeType type; -/* @@protoc_insertion_point(struct:onnx_AttributeProto) */ -} onnx_AttributeProto; - -typedef struct _onnx_ModelProto { - bool has_ir_version; - int64_t ir_version; - pb_callback_t producer_name; - pb_callback_t producer_version; - pb_callback_t domain; - bool has_model_version; - int64_t model_version; - pb_callback_t doc_string; - pb_callback_t graph; - pb_callback_t opset_import; - pb_callback_t metadata_props; -/* @@protoc_insertion_point(struct:onnx_ModelProto) */ -} onnx_ModelProto; - -typedef struct _onnx_OperatorSetIdProto { - pb_callback_t domain; - bool has_version; - int64_t version; -/* @@protoc_insertion_point(struct:onnx_OperatorSetIdProto) */ -} onnx_OperatorSetIdProto; - -typedef struct _onnx_TensorProto_Segment { - bool has_begin; - int64_t begin; - bool has_end; - int64_t end; -/* @@protoc_insertion_point(struct:onnx_TensorProto_Segment) */ -} onnx_TensorProto_Segment; - -typedef struct _onnx_TensorShapeProto_Dimension { - bool has_dim_value; - int64_t dim_value; - pb_callback_t dim_param; -/* @@protoc_insertion_point(struct:onnx_TensorShapeProto_Dimension) */ -} onnx_TensorShapeProto_Dimension; - -typedef struct _onnx_TypeProto_Tensor { - bool has_elem_type; - onnx_TensorProto_DataType elem_type; - pb_callback_t shape; -/* @@protoc_insertion_point(struct:onnx_TypeProto_Tensor) */ -} onnx_TypeProto_Tensor; - -typedef struct _onnx_TensorProto { - pb_callback_t dims; - bool has_data_type; - onnx_TensorProto_DataType data_type; - bool has_segment; - onnx_TensorProto_Segment segment; - pb_callback_t float_data; - pb_callback_t int32_data; - pb_callback_t string_data; - pb_callback_t int64_data; - pb_callback_t name; - pb_callback_t raw_data; - pb_callback_t double_data; - pb_callback_t uint64_data; - pb_callback_t doc_string; -/* @@protoc_insertion_point(struct:onnx_TensorProto) */ -} onnx_TensorProto; - -/* Default values for struct fields */ - -/* Initializer values for message structs */ -#define onnx_AttributeProto_init_default {{{NULL}, NULL}, false, 0, false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, false, (onnx_AttributeProto_AttributeType)0} -#define onnx_ValueInfoProto_init_default {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_NodeProto_init_default {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_ModelProto_init_default {false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_StringStringEntryProto_init_default {{{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_GraphProto_init_default {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_TensorProto_init_default {{{NULL}, NULL}, false, (onnx_TensorProto_DataType)0, false, onnx_TensorProto_Segment_init_default, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, 
NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_TensorProto_Segment_init_default {false, 0, false, 0} -#define onnx_TensorShapeProto_init_default {{{NULL}, NULL}} -#define onnx_TensorShapeProto_Dimension_init_default {false, 0, {{NULL}, NULL}} -#define onnx_TypeProto_init_default {{{NULL}, NULL}} -#define onnx_TypeProto_Tensor_init_default {false, (onnx_TensorProto_DataType)0, {{NULL}, NULL}} -#define onnx_OperatorSetIdProto_init_default {{{NULL}, NULL}, false, 0} -#define onnx_AttributeProto_init_zero {{{NULL}, NULL}, false, 0, false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, false, (onnx_AttributeProto_AttributeType)0} -#define onnx_ValueInfoProto_init_zero {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_NodeProto_init_zero {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_ModelProto_init_zero {false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_StringStringEntryProto_init_zero {{{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_GraphProto_init_zero {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_TensorProto_init_zero {{{NULL}, NULL}, false, (onnx_TensorProto_DataType)0, false, onnx_TensorProto_Segment_init_zero, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} -#define onnx_TensorProto_Segment_init_zero {false, 0, false, 0} -#define onnx_TensorShapeProto_init_zero {{{NULL}, NULL}} -#define onnx_TensorShapeProto_Dimension_init_zero {false, 0, {{NULL}, NULL}} -#define onnx_TypeProto_init_zero {{{NULL}, NULL}} -#define onnx_TypeProto_Tensor_init_zero {false, (onnx_TensorProto_DataType)0, {{NULL}, NULL}} -#define onnx_OperatorSetIdProto_init_zero {{{NULL}, NULL}, false, 0} - -/* Field tags (for use in manual encoding/decoding) */ -#define onnx_GraphProto_node_tag 1 -#define onnx_GraphProto_name_tag 2 -#define onnx_GraphProto_initializer_tag 5 -#define onnx_GraphProto_doc_string_tag 10 -#define onnx_GraphProto_input_tag 11 -#define onnx_GraphProto_output_tag 12 -#define onnx_GraphProto_value_info_tag 13 -#define onnx_NodeProto_input_tag 1 -#define onnx_NodeProto_output_tag 2 -#define onnx_NodeProto_name_tag 3 -#define onnx_NodeProto_op_type_tag 4 -#define onnx_NodeProto_domain_tag 7 -#define onnx_NodeProto_attribute_tag 5 -#define onnx_NodeProto_doc_string_tag 6 -#define onnx_StringStringEntryProto_key_tag 1 -#define onnx_StringStringEntryProto_value_tag 2 -#define onnx_TensorShapeProto_dim_tag 1 -#define onnx_TypeProto_tensor_type_tag 1 -#define onnx_ValueInfoProto_name_tag 1 -#define onnx_ValueInfoProto_type_tag 2 -#define onnx_ValueInfoProto_doc_string_tag 3 -#define onnx_AttributeProto_name_tag 1 -#define onnx_AttributeProto_doc_string_tag 13 -#define onnx_AttributeProto_type_tag 20 -#define onnx_AttributeProto_f_tag 2 -#define onnx_AttributeProto_i_tag 3 -#define onnx_AttributeProto_s_tag 4 -#define onnx_AttributeProto_t_tag 5 -#define onnx_AttributeProto_g_tag 6 -#define onnx_AttributeProto_floats_tag 7 -#define onnx_AttributeProto_ints_tag 8 -#define onnx_AttributeProto_strings_tag 9 -#define onnx_AttributeProto_tensors_tag 10 -#define onnx_AttributeProto_graphs_tag 11 -#define 
onnx_ModelProto_ir_version_tag 1 -#define onnx_ModelProto_opset_import_tag 8 -#define onnx_ModelProto_producer_name_tag 2 -#define onnx_ModelProto_producer_version_tag 3 -#define onnx_ModelProto_domain_tag 4 -#define onnx_ModelProto_model_version_tag 5 -#define onnx_ModelProto_doc_string_tag 6 -#define onnx_ModelProto_graph_tag 7 -#define onnx_ModelProto_metadata_props_tag 14 -#define onnx_OperatorSetIdProto_domain_tag 1 -#define onnx_OperatorSetIdProto_version_tag 2 -#define onnx_TensorProto_Segment_begin_tag 1 -#define onnx_TensorProto_Segment_end_tag 2 -#define onnx_TensorShapeProto_Dimension_dim_value_tag 1 -#define onnx_TensorShapeProto_Dimension_dim_param_tag 2 -#define onnx_TypeProto_Tensor_elem_type_tag 1 -#define onnx_TypeProto_Tensor_shape_tag 2 -#define onnx_TensorProto_dims_tag 1 -#define onnx_TensorProto_data_type_tag 2 -#define onnx_TensorProto_segment_tag 3 -#define onnx_TensorProto_float_data_tag 4 -#define onnx_TensorProto_int32_data_tag 5 -#define onnx_TensorProto_string_data_tag 6 -#define onnx_TensorProto_int64_data_tag 7 -#define onnx_TensorProto_name_tag 8 -#define onnx_TensorProto_doc_string_tag 12 -#define onnx_TensorProto_raw_data_tag 9 -#define onnx_TensorProto_double_data_tag 10 -#define onnx_TensorProto_uint64_data_tag 11 - -/* Struct field encoding specification for nanopb */ -extern const pb_field_t onnx_AttributeProto_fields[14]; -extern const pb_field_t onnx_ValueInfoProto_fields[4]; -extern const pb_field_t onnx_NodeProto_fields[8]; -extern const pb_field_t onnx_ModelProto_fields[10]; -extern const pb_field_t onnx_StringStringEntryProto_fields[3]; -extern const pb_field_t onnx_GraphProto_fields[8]; -extern const pb_field_t onnx_TensorProto_fields[13]; -extern const pb_field_t onnx_TensorProto_Segment_fields[3]; -extern const pb_field_t onnx_TensorShapeProto_fields[2]; -extern const pb_field_t onnx_TensorShapeProto_Dimension_fields[3]; -extern const pb_field_t onnx_TypeProto_fields[2]; -extern const pb_field_t onnx_TypeProto_Tensor_fields[3]; -extern const pb_field_t onnx_OperatorSetIdProto_fields[3]; - -/* Maximum encoded size of messages (where known) */ -/* onnx_AttributeProto_size depends on runtime parameters */ -/* onnx_ValueInfoProto_size depends on runtime parameters */ -/* onnx_NodeProto_size depends on runtime parameters */ -/* onnx_ModelProto_size depends on runtime parameters */ -/* onnx_StringStringEntryProto_size depends on runtime parameters */ -/* onnx_GraphProto_size depends on runtime parameters */ -/* onnx_TensorProto_size depends on runtime parameters */ -#define onnx_TensorProto_Segment_size 22 -/* onnx_TensorShapeProto_size depends on runtime parameters */ -/* onnx_TensorShapeProto_Dimension_size depends on runtime parameters */ -/* onnx_TypeProto_size depends on runtime parameters */ -/* onnx_TypeProto_Tensor_size depends on runtime parameters */ -/* onnx_OperatorSetIdProto_size depends on runtime parameters */ - -/* Message IDs (where set with "msgid" option) */ -#ifdef PB_MSGID - -#define ONNX_MESSAGES \ - - -#endif - -#ifdef __cplusplus -} /* extern "C" */ -#endif -/* @@protoc_insertion_point(eof) */ - -#endif diff --git a/torch/csrc/onnx/onnx.options b/torch/csrc/onnx/onnx.options deleted file mode 100644 index dd02d208eb7698..00000000000000 --- a/torch/csrc/onnx/onnx.options +++ /dev/null @@ -1,24 +0,0 @@ -# Note [Callback for nested messages] -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# nanopb's default translation for a nested, non-repeated (possibly -# optional) message is to include it *inline* (no indirection), with -# a boolean 
has_g/has_t field to indicate its presence or not. Why -# do we not like this? It's not compatible with our ownership model, -# where a TensorProto/GraphProto class owns the protobuf struct it -# is constructing. With the default translation, the protobuf struct -# occurs in two places: a TensorProto, AND the parent protobuf struct -# field. That's bad. Turning it back into a callback solves the -# ownership problem. -# -# Two more bonuses: at the cost of an indirection, we no longer waste fields -# when we aren't actually storing a graph/tensor; furthermore, circular -# dependencies now work! - -onnx.AttributeProto.g type:FT_CALLBACK -onnx.AttributeProto.t type:FT_CALLBACK -onnx.ModelProto.graph type:FT_CALLBACK -onnx.TypeProto.Tensor.shape type:FT_CALLBACK -onnx.TypeProto.tensor_type type:FT_CALLBACK -onnx.ValueInfoProto.type type:FT_CALLBACK -onnx.TypeProto no_unions:true -onnx.TensorShapeProto.Dimension no_unions:true diff --git a/torch/csrc/utils/hash.h b/torch/csrc/utils/hash.h index 05a5a27b51223a..954a7b5b7d0814 100644 --- a/torch/csrc/utils/hash.h +++ b/torch/csrc/utils/hash.h @@ -32,7 +32,7 @@ namespace torch { // DEALINGS IN THE SOFTWARE. inline size_t hash_combine(size_t seed, size_t value) { - return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2)); + return seed ^ (value + 0x9e3779b9 + (seed << 6u) + (seed >> 2u)); } //////////////////////////////////////////////////////////////////////////////// diff --git a/torch/csrc/utils/invalid_arguments.cpp b/torch/csrc/utils/invalid_arguments.cpp index f8d5fd1ba1cd63..0160bdd2d8e506 100644 --- a/torch/csrc/utils/invalid_arguments.cpp +++ b/torch/csrc/utils/invalid_arguments.cpp @@ -16,7 +16,7 @@ std::string py_typename(PyObject *object) { struct Type { virtual bool is_matching(PyObject *object) = 0; - virtual ~Type() {}; + virtual ~Type() = default; }; struct SimpleType: public Type { diff --git a/torch/csrc/utils/invalid_arguments.h b/torch/csrc/utils/invalid_arguments.h index 138c3331113b7c..daaccfd877f377 100644 --- a/torch/csrc/utils/invalid_arguments.h +++ b/torch/csrc/utils/invalid_arguments.h @@ -7,7 +7,9 @@ namespace torch { std::string format_invalid_args( - PyObject *args, PyObject *kwargs, const std::string& name, + PyObject* given_args, + PyObject* given_kwargs, + const std::string& function_name, const std::vector& options); } // namespace torch diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index b00bd27c087495..0f2f51904c2554 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -90,8 +90,8 @@ struct PythonArgParser { private: [[noreturn]] - void print_error(PyObject* args, PyObject* kwargs, PyObject* dst[]); - PythonArgs raw_parse(PyObject* args, PyObject* kwargs, PyObject* dst[]); + void print_error(PyObject* args, PyObject* kwargs, PyObject* parsed_args[]); + PythonArgs raw_parse(PyObject* args, PyObject* kwargs, PyObject* parsed_args[]); std::vector signatures_; std::string function_name; diff --git a/torch/csrc/utils/tensor_apply.h b/torch/csrc/utils/tensor_apply.h index 47fbaa672c4262..5dfdef98c81db4 100644 --- a/torch/csrc/utils/tensor_apply.h +++ b/torch/csrc/utils/tensor_apply.h @@ -6,8 +6,8 @@ namespace torch { namespace utils { at::Tensor & apply_(at::Tensor & self, PyObject* fn); -at::Tensor & map_(at::Tensor & self, const at::Tensor & other, PyObject* fn); -at::Tensor & map2_(at::Tensor & self, const at::Tensor & other1, - const at::Tensor & other2, PyObject* fn); +at::Tensor & map_(at::Tensor & self, const at::Tensor 
& other_, PyObject* fn); +at::Tensor & map2_(at::Tensor & self, const at::Tensor & x_, + const at::Tensor & y_, PyObject* fn); }} // namespace torch::utils diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index 3a8b4a7bbc1592..d03fd55f2accfc 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -139,8 +139,10 @@ ScalarType infer_scalar_type(PyObject *obj) { } #ifdef USE_NUMPY if (PyArray_Check(obj)) { - auto array = (PyArrayObject*)obj; - return numpy_dtype_to_aten(PyArray_TYPE(array)); + return numpy_dtype_to_aten(PyArray_TYPE((PyArrayObject*)obj)); + } + if (PyArray_CheckScalar(obj)) { + return numpy_dtype_to_aten(PyArray_TYPE((PyArrayObject*)(PyArray_FromScalar(obj, NULL)))); } #endif if (PySequence_Check(obj)) { diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py index f8b26b121fd3e8..a2086ae95b899c 100644 --- a/torch/distributed/__init__.py +++ b/torch/distributed/__init__.py @@ -61,7 +61,8 @@ def init_process_group(backend, init_method='env://', **kwargs): group_name (str, optional): Group name. See description of init methods. To enable ``backend == mpi``, PyTorch needs to be built from source on a system that - supports MPI. + supports MPI. If you want to use Open MPI with CUDA-aware support, please use Open MPI + major version 2 and above. """ world_size = kwargs.pop('world_size', -1) diff --git a/torch/distributions/__init__.py b/torch/distributions/__init__.py index ca961d88ba0a63..47ee177c2cc959 100644 --- a/torch/distributions/__init__.py +++ b/torch/distributions/__init__.py @@ -96,6 +96,7 @@ from .lowrank_multivariate_normal import LowRankMultivariateNormal from .multinomial import Multinomial from .multivariate_normal import MultivariateNormal +from .negative_binomial import NegativeBinomial from .normal import Normal from .one_hot_categorical import OneHotCategorical from .pareto import Pareto @@ -129,6 +130,7 @@ 'LogisticNormal', 'Multinomial', 'MultivariateNormal', + 'NegativeBinomial', 'Normal', 'OneHotCategorical', 'Pareto', diff --git a/torch/distributions/constraint_registry.py b/torch/distributions/constraint_registry.py index a263082c967fe1..f8688af3f3a392 100644 --- a/torch/distributions/constraint_registry.py +++ b/torch/distributions/constraint_registry.py @@ -164,7 +164,9 @@ def _transform_to_positive(constraint): @biject_to.register(constraints.greater_than) +@biject_to.register(constraints.greater_than_eq) @transform_to.register(constraints.greater_than) +@transform_to.register(constraints.greater_than_eq) def _transform_to_greater_than(constraint): return transforms.ComposeTransform([transforms.ExpTransform(), transforms.AffineTransform(constraint.lower_bound, 1)]) @@ -178,7 +180,9 @@ def _transform_to_less_than(constraint): @biject_to.register(constraints.interval) +@biject_to.register(constraints.half_open_interval) @transform_to.register(constraints.interval) +@transform_to.register(constraints.half_open_interval) def _transform_to_interval(constraint): # Handle the special case of the unit interval.
lower_is_0 = isinstance(constraint.lower_bound, numbers.Number) and constraint.lower_bound == 0 diff --git a/torch/distributions/constraints.py b/torch/distributions/constraints.py index 18da2bff1392a4..0b6eb53b0cd93a 100644 --- a/torch/distributions/constraints.py +++ b/torch/distributions/constraints.py @@ -27,8 +27,10 @@ 'dependent', 'dependent_property', 'greater_than', + 'greater_than_eq', 'integer_interval', 'interval', + 'half_open_interval', 'is_dependent', 'less_than', 'lower_cholesky', @@ -151,6 +153,17 @@ def check(self, value): return self.lower_bound < value +class _GreaterThanEq(Constraint): + """ + Constrain to a real half line `[lower_bound, inf)`. + """ + def __init__(self, lower_bound): + self.lower_bound = lower_bound + + def check(self, value): + return self.lower_bound <= value + + class _LessThan(Constraint): """ Constrain to a real half line `[-inf, upper_bound)`. @@ -174,6 +187,18 @@ def check(self, value): return (self.lower_bound <= value) & (value <= self.upper_bound) +class _HalfOpenInterval(Constraint): + """ + Constrain to a real interval `[lower_bound, upper_bound)`. + """ + def __init__(self, lower_bound, upper_bound): + self.lower_bound = lower_bound + self.upper_bound = upper_bound + + def check(self, value): + return (self.lower_bound <= value) & (value < self.upper_bound) + + class _Simplex(Constraint): """ Constrain to the unit simplex in the innermost (rightmost) dimension. @@ -240,9 +265,11 @@ def check(self, value): real_vector = _RealVector() positive = _GreaterThan(0.) greater_than = _GreaterThan +greater_than_eq = _GreaterThanEq less_than = _LessThan unit_interval = _Interval(0., 1.) interval = _Interval +half_open_interval = _HalfOpenInterval simplex = _Simplex() lower_triangular = _LowerTriangular() lower_cholesky = _LowerCholesky() diff --git a/torch/distributions/negative_binomial.py b/torch/distributions/negative_binomial.py new file mode 100644 index 00000000000000..854ad5b7b087fa --- /dev/null +++ b/torch/distributions/negative_binomial.py @@ -0,0 +1,83 @@ +import torch +import torch.nn.functional as F +from torch.distributions import constraints +from torch.distributions.distribution import Distribution +from torch.distributions.utils import broadcast_all, probs_to_logits, lazy_property, logits_to_probs + + +class NegativeBinomial(Distribution): + r""" + Creates a Negative Binomial distribution, i.e. distribution + of the number of independent identical Bernoulli trials + needed before `total_count` failures are achieved. The probability + of success of each Bernoulli trial is `probs`. 
+ + Args: + total_count (float or Tensor): non-negative number of negative Bernoulli + trials to stop, although the distribution is still valid for real + valued count + probs (Tensor): Event probabilities of success in the half open interval [0, 1) + logits (Tensor): Event log-odds for probabilities of success + """ + arg_constraints = {'total_count': constraints.greater_than_eq(0), + 'probs': constraints.half_open_interval(0., 1.)} + support = constraints.nonnegative_integer + + def __init__(self, total_count, probs=None, logits=None, validate_args=None): + if (probs is None) == (logits is None): + raise ValueError("Either `probs` or `logits` must be specified, but not both.") + if probs is not None: + self.total_count, self.probs, = broadcast_all(total_count, probs) + self.total_count = self.total_count.type_as(self.probs) + else: + self.total_count, self.logits, = broadcast_all(total_count, logits) + self.total_count = self.total_count.type_as(self.logits) + + self._param = self.probs if probs is not None else self.logits + batch_shape = self._param.size() + super(NegativeBinomial, self).__init__(batch_shape, validate_args=validate_args) + + def _new(self, *args, **kwargs): + return self._param.new(*args, **kwargs) + + @property + def mean(self): + return self.total_count * torch.exp(self.logits) + + @property + def variance(self): + return self.mean / torch.sigmoid(-self.logits) + + @lazy_property + def logits(self): + return probs_to_logits(self.probs, is_binary=True) + + @lazy_property + def probs(self): + return logits_to_probs(self.logits, is_binary=True) + + @property + def param_shape(self): + return self._param.size() + + @lazy_property + def _gamma(self): + return torch.distributions.Gamma(concentration=self.total_count, + rate=torch.exp(-self.logits)) + + def sample(self, sample_shape=torch.Size()): + with torch.no_grad(): + rate = self._gamma.sample(sample_shape=sample_shape) + return torch.poisson(rate) + + def log_prob(self, value): + if self._validate_args: + self._validate_sample(value) + + log_unnormalized_prob = (self.total_count * F.logsigmoid(-self.logits) + + value * F.logsigmoid(self.logits)) + + log_normalization = (-torch.lgamma(self.total_count + value) + torch.lgamma(1. + value) + + torch.lgamma(self.total_count)) + + return log_unnormalized_prob - log_normalization diff --git a/torch/distributions/utils.py b/torch/distributions/utils.py index ccc0ffffa2ec21..0219942aac155a 100644 --- a/torch/distributions/utils.py +++ b/torch/distributions/utils.py @@ -32,30 +32,19 @@ def _finfo(tensor): return _FINFO[tensor.storage_type()] -def _broadcast_shape(shapes): - r""" - Given a list of tensor sizes, returns the size of the resulting broadcasted - tensor. - - Args: - shapes (list of torch.Size): list of tensor sizes - """ - shape = torch.Size() - for s in shapes: - shape = torch._C._infer_size(s, shape) - return shape +# promote numbers to tensors of dtype torch.get_default_dtype() +def _default_promotion(v): + return torch.tensor(v, dtype=torch.get_default_dtype()) def broadcast_all(*values): r""" Given a list of values (possibly containing numbers), returns a list where each value is broadcasted based on the following rules: - - `torch.*Tensor` instances are broadcasted as per the `broadcasting rules - `_ + - `torch.*Tensor` instances are broadcasted as per :ref:`_broadcasting-semantics`. - numbers.Number instances (scalars) are upcast to tensors having the same size and type as the first tensor passed to `values`. 
If all the - values are scalars, then they are upcasted to Tensors having size - `(1,)`. + values are scalars, then they are upcasted to scalar Tensors. Args: values (list of `numbers.Number` or `torch.*Tensor`) @@ -64,22 +53,16 @@ def broadcast_all(*values): ValueError: if any of the values is not a `numbers.Number` or `torch.*Tensor` instance """ - values = list(values) - scalar_idxs = [i for i in range(len(values)) if isinstance(values[i], Number)] - tensor_idxs = [i for i in range(len(values)) if values[i].__class__.__name__ == 'Tensor'] - if len(scalar_idxs) + len(tensor_idxs) != len(values): + if not all(torch.is_tensor(v) or isinstance(v, Number) for v in values): raise ValueError('Input arguments must all be instances of numbers.Number or torch.tensor.') - if tensor_idxs: - broadcast_shape = _broadcast_shape([values[i].size() for i in tensor_idxs]) - for idx in tensor_idxs: - values[idx] = values[idx].expand(broadcast_shape) - template = values[tensor_idxs[0]] - for idx in scalar_idxs: - values[idx] = template.new(template.size()).fill_(values[idx]) - else: - for idx in scalar_idxs: - values[idx] = torch.tensor(float(values[idx])) - return values + if not all(map(torch.is_tensor, values)): + new_tensor = _default_promotion + for value in values: + if torch.is_tensor(value): + new_tensor = value.new_tensor + break + values = [v if torch.is_tensor(v) else new_tensor(v) for v in values] + return torch.broadcast_tensors(*values) def _sum_rightmost(value, dim): diff --git a/torch/functional.py b/torch/functional.py index 19d47f394fa757..0133a012981854 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -10,6 +10,7 @@ 'argmin', 'btrifact', 'btriunpack', + 'broadcast_tensors', 'isfinite', 'isinf', 'isnan', @@ -19,6 +20,28 @@ ] +def broadcast_tensors(*tensors): + r"""broadcast_tensors(*tensors) -> List of Tensors + + Broadcasts the given tensors according to :ref:`_broadcasting-semantics`. + + Args: + *tensors: any number of tensors of the same type + + Example:: + + >>> x = torch.arange(3).view(1, 3) + >>> y = torch.arange(2).view(2, 1) + >>> a, b = torch.broadcast_tensors(x, y) + >>> a.size() + torch.Size([2, 3]) + >>> a + tensor([[0, 1, 2], + [0, 1, 2]]) + """ + return torch._C._VariableFunctions.broadcast_tensors(tensors) + + def split(tensor, split_size_or_sections, dim=0): r"""Splits the tensor into chunks. 
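[Editor's note on the two hunks above, before the next file diff.] The rewritten `broadcast_all` now delegates to the new `torch.broadcast_tensors`: plain numbers are first promoted to scalar tensors (taking the dtype of the first tensor argument when one is present, otherwise `torch.get_default_dtype()`), and everything is then broadcast together. A minimal sketch of the resulting behaviour, assuming a PyTorch build that includes this patch:

# Hedged sketch: illustrates the post-patch broadcast_all semantics.
import torch
from torch.distributions.utils import broadcast_all

x = torch.arange(6., dtype=torch.float64).view(2, 3)
a, b = broadcast_all(x, 0.5)     # 0.5 is promoted via x.new_tensor(0.5)
assert a.shape == b.shape == (2, 3)
assert b.dtype == torch.float64  # dtype follows the first tensor argument

c, = broadcast_all(2.0)          # all-scalar input -> 0-dim tensor of the default dtype
assert c.dim() == 0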
diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index c0cf4f9d1c2e75..d09e970f729470 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -403,9 +403,12 @@ def wrapper(*args): else: new_args.append(arg) res = res_mod(*new_args) - # assert len(res) / 3 == 0 - # result = [BatchTensor(*res[i * 3: i * 3 + 3]) for i in range(len(res) // 3)] - result = BatchTensor(*res) + assert len(res) % 3 == 0 + if len(res) % 3 != 0: + raise "non-batched-tensor output is not supported yet" + result = [BatchTensor(*res[i * 3: i * 3 + 3]) for i in range(len(res) // 3)] + if len(result) == 1: + return result[0] return result wrapper.__doc__ = fn.__doc__ return wrapper diff --git a/torch/jit/annotations.py b/torch/jit/annotations.py index 77e6cf777f2784..1db7749e07e34e 100644 --- a/torch/jit/annotations.py +++ b/torch/jit/annotations.py @@ -3,7 +3,7 @@ import ast import inspect import torch -from torch._C import DynamicType, TupleType +from torch._C import DynamicType, TupleType, FloatType, IntType from textwrap import dedent @@ -204,9 +204,13 @@ def as_ann(ann): def ann_to_type(ann): if ann is None: - return DynamicType() + return DynamicType.get() elif ann is torch.Tensor: - return DynamicType() + return DynamicType.get() elif is_tuple(ann): return TupleType([ann_to_type(a) for a in ann.__args__]) + elif ann is float: + return FloatType.get() + elif ann is int: + return IntType.get() raise ValueError("The only supported annotations kinds are Tensor and Tuple[...]") diff --git a/torch/jit/batchop.py b/torch/jit/batchop.py index bda6a3adca3a88..053130dc0fb488 100644 --- a/torch/jit/batchop.py +++ b/torch/jit/batchop.py @@ -1,6 +1,9 @@ import torch +from torch.jit import BatchTensor +# TODO: there are some commented raise statements +# when we support rasie exception in script, we want to check them @torch.jit.script def batch_tanh(data, mask, dims): data = torch.tanh(data) @@ -14,13 +17,52 @@ def batch_sigmoid(data, mask, dims): @torch.jit.script -def batch_add(data1, mask1, dims1, data2, mask2, dims2): - data = torch.add(data1, data2) +def batch_relu(data, mask, dims): + data = torch.relu(data) + return data, mask, dims + + +@torch.jit.script +def batch_neg(data, mask, dims): + data = torch.neg(data) + return data, mask, dims + + +@torch.jit.script +def batch_neg_scalar(data): + return torch.neg(data) + + +@torch.jit.script +def batch_add(data1, mask1, dims1, data2, mask2, dims2, alpha_): + alpha = float(alpha_) + data = torch.add(data1, data2, alpha) + mask = mask1 * mask2 + dims = dims1 or dims2 + return data, mask, dims + + +@torch.jit.script +def batch_add_scalar(data, mask, dims, other, alpha_): + alpha = float(alpha_) + data = torch.add(data, other.type_as(data), alpha) + return data, mask, dims + + +@torch.jit.script +def batch_sub(data1, mask1, dims1, data2, mask2, dims2, alpha_): + alpha = float(alpha_) + data = torch.sub(data1, data2, alpha) mask = mask1 * mask2 dims = dims1 or dims2 return data, mask, dims +@torch.jit.script +def batch_sub_scalar(data1, data2): + return data1 - data2 + + @torch.jit.script def batch_mul(data1, mask1, dims1, data2, mask2, dims2): data = torch.mul(data1, data2) @@ -29,6 +71,17 @@ def batch_mul(data1, mask1, dims1, data2, mask2, dims2): return data, mask, dims +@torch.jit.script +def batch_mul_scalar(data1, data2): + return data1 * data2 + + +@torch.jit.script +def batch_div(data, mask, dims, other): # div(batchtensor, scalar) + data = torch.div(data, other) + return data, mask, dims + + @torch.jit.script def batch_mm(data1, mask1, dims1, data2, 
mask2, dims2): data1 = data1 * mask1.type_as(data1) @@ -88,26 +141,388 @@ def batch_select(data, mask, dims, dim_, index_): # raise ValueError("Cannot select 0 dim in BatchTensor") data = data.select(dim, index) if dims[dim - 1]: - mask = mask.select(dim, 0) - else: mask = mask.select(dim, index) + else: + mask = mask.select(dim, 0) dims = torch.cat((dims[:dim - 1], dims[dim:dims.size(0)])) return data, mask, dims +@torch.jit.script +def batch_fmod(data, mask, dims, other_): + other = int(other_) + data = torch.fmod(data, other) + return data, mask, dims + + +@torch.jit.script +def batch_zeros_like(data, mask, dims): + res_data = torch.zeros_like(data) + return res_data, mask, dims + + +@torch.jit.script +def batch_index_select(data, mask, dims, dim_, index_data, index_mask, index_dims): + dim = int(dim_) + # if dim == 0: + # raise ValueError("Cannot index_select along 0 dim in BatchTensor") + batch_size = data.size(0) # TODO maybe index_mask will be used at some point + res_data = torch.zeros([0]) + res_mask = torch.zeros([0]) + for i in range(batch_size): + d = data[i].index_select(dim - 1, index_data[i]).unsqueeze(0) + if dims[dim - 1]: + m = mask[i].index_select(dim - 1, index_data[i]).unsqueeze(0) + else: + m = mask[i].unsqueeze(0) + if i == 0: + res_data = d + res_mask = m + else: + res_data = torch.cat((res_data, d), 0) + res_mask = torch.cat((res_mask, m), 0) + return res_data, res_mask, dims + + +@torch.jit.script +def batch_view_as(data, mask, dims, data1, mask1, dims1): + # if data.size(0) != data1.size(0): + # raise ValueError("In view_as, tensor and target tensor should have the same batch_size") + # if not torch.equal(dims, dims1): + # raise ValueError("In batched view_as, dims and target dims should be the same") + data = data.view_as(data1) + mask = mask.view_as(mask1) + dims = dims1 + return data, mask, dims + + # assume data, data1, data2 have same size @torch.jit.script def batch_where(data, mask, dims, data1, mask1, dims1, data2, mask2, dims2): - res_data = torch.where(data, data1, data2) - res_mask = torch.where(data, mask1, mask2) + data = data * mask.type_as(data) + cond_data = data + cond_mask = data + if data.dim() == 1: + for _ in range(data1.dim() - 1): + data = data.unsqueeze(data.dim()) + cond_data = data.expand_as(data1) + cond_mask = data.expand_as(mask1) + res_data = torch.where(cond_data, data1, data2) + res_mask = torch.where(cond_mask, mask1, mask2) res_dims = dims1 or dims2 return res_data, res_mask, res_dims + +@torch.jit.script +def batch_where_scalar(cond_, data1, mask1, dims1, data2, mask2, dims2): + cond = torch.zeros([1], dtype=torch.uint8) * cond_ + res_data = torch.where(cond, data1, data2) + res_mask = torch.where(cond, mask1, mask2) + res_dims = torch.where(cond, dims1, dims2) + return res_data, res_mask, res_dims + + +@torch.jit.script +def batch_update(batch_data, batch_mask, batch_dims, new_data, new_mask, new_dims): + data = torch.where(new_mask, new_data, batch_data) + return data, new_mask, new_dims # TODO: consider whether return new_mask and new_dims + + +@torch.jit.script +def batch_any(data, mask, dims): + return torch.gt(torch.sum(data * mask), 0) + + +@torch.jit.script +def batch_type_as(data, mask, dims, data1, mask1, dims1): + return data.type_as(data1), mask, dims + + +@torch.jit.script +def batch_gt(data, mask, dims, data1, mask1, dims1): + return torch.gt(data, data1), mask * mask1, dims or dims1 + + +@torch.jit.script +def batch_gt_scalar(data1, data2): + return torch.gt(data1, data2) + + +@torch.jit.script +def 
batch_gt_one_scalar(data, mask, dims, other_): + other = float(other_) + return torch.gt(data, other), mask, dims + + +@torch.jit.script +def batch_lt(data, mask, dims, data1, mask1, dims1): + return torch.lt(data, data1), mask * mask1, dims or dims1 + + +@torch.jit.script +def batch_eq(data, mask, dims, data1, mask1, dims1): + return torch.eq(data, data1), mask * mask1, dims or dims1 + + +@torch.jit.script +def batch_size(data, mask, dims, dim_): + dim = int(dim_) + return data.size(dim) + + +@torch.jit.script +def batch_dim(data, mask, dims): + return data.dim() + + +@torch.jit.script +def batch_squeeze(data, mask, dims, dim_): + if int(dim_) < 0: + dim_ += data.dim() + dim = int(dim_) + # if dim == 0: + # raise ValueError("cannot do squeeze along batch_dim") + data = data.squeeze(dim) + mask = mask.squeeze(dim) + dims = torch.cat((dims[:dim - 1], dims[dim:dims.size(0)])) + return data, mask, dims + + +@torch.jit.script +def batch_unsqueeze(data, mask, dims, dim_): + if int(dim_) < 0: + dim_ += data.dim() + 1 + dim = int(dim_) + # if dim == 0: + # raise ValueError("cannot do unsqueeze along batch_dim") + data = data.unsqueeze(dim) + mask = mask.unsqueeze(dim) + dims = torch.cat((dims[:dim], torch.zeros([1], dtype=torch.uint8), dims[dim:dims.size(0)])) + return data, mask, dims + + +@torch.jit.script +def batch_argmax(data, mask, dims, dim_, keepdim_): + dim = int(dim_) + keepdim = int(keepdim_) + # if dim == 0: + # raise ValueError("cannot do argmax along batch_dim") + batch_size = data.size(0) + res_data = torch.zeros([0]) + for i in range(batch_size): + if dims[dim - 1]: + if dim - 1 != 0: + m = mask[i].transpose(0, dim - 1) + else: + m = mask[i] + valid_num = m.sum(0, keepdim=True) + while(valid_num.dim() >= 1): + valid_num = valid_num[0] + d = data[i].unsqueeze(0).narrow(dim, 0, int(valid_num)) + else: + d = data[i].unsqueeze(0) + d = d.argmax(dim, keepdim) + if i == 0: + res_data = d + else: + res_data = torch.cat([res_data, d], 0) + if keepdim: + mask = mask + else: + mask = mask.select(dim, 0) + dims = torch.cat((dims[:dim - 1], dims[dim:dims.size(0)])) + return res_data, mask, dims + + +@torch.jit.script +def batch_topk(data, mask, dims, k_, dim_, largest_, sorted_): + k = int(k_) + dim = int(dim_) + largest = int(largest_) + sorted = int(sorted_) + # if dim == 0: + # raise ValueError("cannot do topk along batch_dim") + batch_size = data.size(0) + res_data = torch.zeros([0]) + res_index = torch.zeros([0]) + for i in range(batch_size): + if dims[dim - 1]: + if dim - 1 != 0: + m = mask[i].transpose(0, dim - 1) + else: + m = mask[i] + valid_num = m.sum(0, keepdim=True) + while(valid_num.dim() >= 1): + valid_num = valid_num[0] + d = data[i].unsqueeze(0).narrow(dim, 0, int(valid_num)) + else: + d = data[i].unsqueeze(0) + d, idx = d.topk(k, dim, largest, sorted) + if i == 0: + res_data = d + res_index = idx + else: + res_data = torch.cat([res_data, d], 0) + res_index = torch.cat([res_index, idx], 0) + if dims[dim - 1]: + mask = mask.narrow(dim, 0, k) + return res_data, mask, dims, res_index, mask, dims + + +@torch.jit.script +def batch_softmax(data, mask, dims, dim_): + dim = int(dim_) + # if dim == 0: + # raise ValueError("cannot do softmax along batch_dim") + batch_size = data.size(0) + max_len = data.size(dim) + res_data = torch.zeros([0]) + for i in range(batch_size): + if dims[dim - 1]: + if dim - 1 != 0: + m = mask[i].transpose(0, dim - 1) + else: + m = mask[i] + valid_num = m.sum(0, keepdim=True) + while(valid_num.dim() >= 1): + valid_num = valid_num[0] + valid_num = 
int(valid_num) + d = data[i].unsqueeze(0).narrow(dim, 0, valid_num).softmax(dim) + if valid_num < max_len: + d = torch.cat([d, data[i].unsqueeze(0).narrow(dim, valid_num, max_len - valid_num)], dim) + else: + d = data[i].unsqueeze(0).softmax(dim) + if i == 0: + res_data = d + else: + res_data = torch.cat([res_data, d], 0) + return res_data, mask, dims + + +# size argument in dynamic dimension has to be -1 +# in static dimension, size has to be specified, -1 is not supported +@torch.jit.script +def batch_view(data, mask, dims, sizes): + batch_size = data.size(0) + # if(sizes[0] != batch_size and sizes[0] != -1 and sizes[0] != 1): + # raise "first dim in view must be 1, -1, or batch size" + # for i in range(dims.size(0)): + # if dims[0] == 1 and sizes[i + 1] != -1: + # raise "size argument in dynamic dimension has to be -1" + sizes = sizes.type_as(torch.ones([1], dtype=torch.int)) + data_sizes_ = torch.cat([torch.ones([1], dtype=torch.int) * batch_size, sizes.narrow(0, 1, sizes.size(0) - 1)], 0) + data_sizes = data_sizes_._tensor_to_list() + res_data = data.view(data_sizes) + mask_sizes_ = data_sizes_.narrow(0, 0, 1) + res_dims = data_sizes_.narrow(0, 0, 1) + for i_ in range(sizes.size(0) - 1): + i = i_ + 1 + if(sizes[i] == -1): + cur_size_ = mask.size(i) + cur_dim = 1 + else: + cur_size_ = 1 + cur_dim = 0 + mask_sizes_ = torch.cat([mask_sizes_, torch.ones([1], dtype=torch.int) * cur_size_]) + res_dims = torch.cat([res_dims, torch.ones([1], dtype=torch.int) * cur_dim]) + mask_sizes = mask_sizes_._tensor_to_list() + res_mask = mask.view(mask_sizes) + return res_data, res_mask, res_dims.narrow(0, 1, res_dims.size(0) - 1).type_as(dims) + + +@torch.jit.script +def batch_cat2(data1, mask1, dims1, data2, mask2, dims2, dim_): + dim = int(dim_) + data = torch.cat([data1, data2], dim) + if(dims1[dim - 1]): + mask = torch.cat([mask1, mask2], dim) + else: + mask = mask1 + return data, mask, dims1 + + +@torch.jit.script +def batch_cat3(data1, mask1, dims1, data2, mask2, dims2, data3, mask3, dims3, dim_): + dim = int(dim_) + data = torch.cat([data1, data2, data3], dim) + if(dims1[dim - 1]): + mask = torch.cat([mask1, mask2, mask3], dim) + else: + mask = mask1 + return data, mask, dims1 + + +@torch.jit.script +def batch_narrow(data, mask, dims, dimension_, start_, length_): + dimension = int(dimension_) + start = int(start_) + length = int(length_) + # if dimension == 0: + # raise ValueError("cannot do narrow along batch_dim") + data = data.narrow(dimension, start, length) + if dims[dimension - 1]: + mask = mask.narrow(dimension, start, length) + else: + mask = mask.narrow(dimension, 0, 1) + return data, mask, dims + + +@torch.jit.script +def batch_sum(data, mask, dims): + data = data * mask.type_as(data) + for _ in range(dims.size(0)): + data = data.sum(1) + mask = torch.ones([data.size(0)], dtype=torch.uint8) + dims = dims[:0] # empty tensor + return data, mask, dims + + +@torch.jit.script +def batch_from_scalar_tensor(data): + data = data.unsqueeze(0) + mask = torch.ones([1], dtype=torch.uint8) + dims = torch.zeros([0], dtype=torch.uint8) + return data, mask, dims + torch.register_batch_operator("tanh", batch_tanh.graph) torch.register_batch_operator("sigmoid", batch_sigmoid.graph) +torch.register_batch_operator("relu", batch_relu.graph) +torch.register_batch_operator("neg", batch_neg.graph) +torch.register_batch_operator("neg", batch_neg_scalar.graph) torch.register_batch_operator("add", batch_add.graph) +torch.register_batch_operator("add", batch_add_scalar.graph) 
+torch.register_batch_operator("sub", batch_sub.graph) +torch.register_batch_operator("sub", batch_sub_scalar.graph) torch.register_batch_operator("mul", batch_mul.graph) +torch.register_batch_operator("mul", batch_mul_scalar.graph) +torch.register_batch_operator("div", batch_div.graph) torch.register_batch_operator("matmul", batch_matmul.graph) torch.register_batch_operator("mm", batch_mm.graph) +torch.register_batch_operator("fmod", batch_fmod.graph) +torch.register_batch_operator("zeros_like", batch_zeros_like.graph) torch.register_batch_operator("select", batch_select.graph) +torch.register_batch_operator("index_select", batch_index_select.graph) +torch.register_batch_operator("view_as", batch_view_as.graph) torch.register_batch_operator("where", batch_where.graph) +torch.register_batch_operator("where", batch_where_scalar.graph) +torch.register_batch_operator("update", batch_update.graph) +torch.register_batch_operator("any", batch_any.graph) +torch.register_batch_operator("type_as", batch_type_as.graph) +torch.register_batch_operator("gt", batch_gt.graph) +torch.register_batch_operator("gt", batch_gt_scalar.graph) +torch.register_batch_operator("gt", batch_gt_one_scalar.graph) +torch.register_batch_operator("lt", batch_lt.graph) +torch.register_batch_operator("eq", batch_eq.graph) +torch.register_batch_operator("size", batch_size.graph) +torch.register_batch_operator("dim", batch_dim.graph) +torch.register_batch_operator("squeeze", batch_squeeze.graph) +torch.register_batch_operator("unsqueeze", batch_unsqueeze.graph) +torch.register_batch_operator("argmax", batch_argmax.graph) +torch.register_batch_operator("topk", batch_topk.graph) +torch.register_batch_operator("softmax", batch_softmax.graph) +torch.register_batch_operator("view", batch_view.graph) +torch.register_batch_operator("cat", batch_cat2.graph) +torch.register_batch_operator("cat", batch_cat3.graph) +torch.register_batch_operator("narrow", batch_narrow.graph) +torch.register_batch_operator("sum", batch_sum.graph) +torch.register_batch_operator("batch_from_scalar_tensor", batch_from_scalar_tensor.graph) diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py index d152b2010fcae4..bc979d15141121 100644 --- a/torch/jit/frontend.py +++ b/torch/jit/frontend.py @@ -435,8 +435,8 @@ def build_List(ctx, expr): @staticmethod def build_Tuple(ctx, expr): - return ListLiteral(ctx.make_range(expr.lineno, expr.col_offset, expr.col_offset + 1), - [build_expr(ctx, e) for e in expr.elts]) + return TupleLiteral(ctx.make_range(expr.lineno, expr.col_offset, expr.col_offset + 1), + [build_expr(ctx, e) for e in expr.elts]) @staticmethod def build_Num(ctx, expr): diff --git a/torch/legacy/nn/ELU.py b/torch/legacy/nn/ELU.py index 6ad240658a9e28..9e00e8a172fc88 100644 --- a/torch/legacy/nn/ELU.py +++ b/torch/legacy/nn/ELU.py @@ -23,6 +23,7 @@ def updateOutput(self, input): self.output, self.alpha, 1.0, + 1.0, self.inplace ) return self.output @@ -34,6 +35,7 @@ def updateGradInput(self, input, gradOutput): self.gradInput, self.output, self.alpha, + 1.0, 1.0 ) return self.gradInput diff --git a/torch/lib/THD/base/data_channels/DataChannelMPI.cpp b/torch/lib/THD/base/data_channels/DataChannelMPI.cpp index cc176931d8c0c2..b23157581bdfc0 100644 --- a/torch/lib/THD/base/data_channels/DataChannelMPI.cpp +++ b/torch/lib/THD/base/data_channels/DataChannelMPI.cpp @@ -100,6 +100,14 @@ void DataChannelMPI::destroy() {} bool DataChannelMPI::init() { +#ifdef OMPI_MAJOR_VERSION + // OMPI_* is specific to Openmpi implementation. 
+ // Openmpi v1.10 segfaults in MPI_Bcast with CUDA buffer. + if (int(OMPI_MAJOR_VERSION) < 2) { + throw std::runtime_error("Please use Openmpi major version 2 and above for distributed."); + } +#endif /* OMPI_MAJOR_VERSION */ + int provided; MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided); if (provided != MPI_THREAD_MULTIPLE) { diff --git a/torch/lib/c10d/Utils.hpp b/torch/lib/c10d/Utils.hpp index 26f6c480420b67..9bb0ef0e98ca82 100644 --- a/torch/lib/c10d/Utils.hpp +++ b/torch/lib/c10d/Utils.hpp @@ -64,7 +64,7 @@ inline std::vector> getSizes( const std::vector& tensors) { std::vector> sizes(tensors.size()); for (size_t i = 0; i < tensors.size(); i++) { - sizes[i] = tensors[i].sizes(); + sizes[i] = tensors[i].sizes().vec(); } return sizes; } diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 17a7c09b012da6..746c2664529175 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -741,6 +741,25 @@ def selu(input, inplace=False): """) +def celu(input, alpha=1., inplace=False): + r"""celu(input, alpha=1., inplace=False) -> Tensor + + Applies element-wise, + :math:`\text{CELU}(x) = \max(0,x) + \min(0, \alpha * (\exp(x/\alpha) - 1))`. + + See :class:`~torch.nn.CELU` for more details. + """ + if inplace: + return torch.celu_(input, alpha) + return torch.celu(input, alpha) + +celu_ = _add_docstr(torch.celu_, r""" +celu_(input, alpha=1.) -> Tensor + +In-place version of :func:`~celu`. +""") + + def leaky_relu(input, negative_slope=0.01, inplace=False): r""" leaky_relu(input, negative_slope=0.01, inplace=False) -> Tensor @@ -859,7 +878,7 @@ def softmin(input, dim=None, _stacklevel=3): """ if dim is None: dim = _get_softmax_dim('softmin', input.dim(), _stacklevel) - return -input.softmax(dim) + return (-input).softmax(dim) def softmax(input, dim=None, _stacklevel=3): @@ -1099,7 +1118,7 @@ def embedding(input, weight, padding_idx=None, max_norm=None, norm_type=2, assert padding_idx >= -weight.size(0), 'Padding_idx must be within num_embeddings' padding_idx = weight.size(0) + padding_idx elif padding_idx is None: - padding_idx = -1 + padding_idx = -1 if max_norm is not None: # `embedding_renorm_` will call .contiguous() on input anyways, so we # call it here and take advantage of the improved locality in the @@ -1350,6 +1369,41 @@ def local_response_norm(input, size, alpha=1e-4, beta=0.75, k=1): # loss +def ctc_loss(log_probs, targets, input_lengths, target_lengths, blank=0, + reduction='elementwise_mean'): + r"""The Connectionist Temporal Classification loss. + + See :class:`~torch.nn.CTCLoss` for details. + + Args: + log_probs: :math:`(T, N, C)` where `C = number of characters in alphabet including blank`, + `T = input length`, and `N = batch size`. + The logarithmized probabilities of the outputs + (e.g. obtained with :func:`torch.nn.functional.log_softmax`). + targets: :math:`(N, S)` or `(sum(target_lenghts))`. + Targets (cannot be blank). In the second form, the targets are assumed to be concatenated. + input_lengths: :math:`(N)`. + Lengths of the inputs (must each be :math:`\leq T`) + target_lengths: :math:`(N)`. + Lengths of the targets + blank (int, optional): + Blank label. Default :math:`0`. + reduction (string, optional): Specifies the reduction to apply to the output: + 'none' | 'elementwise_mean' | 'sum'. 'none': no reduction will be applied, + 'elementwise_mean': the output losses will be divided by the target lengths and + then the mean over the batch is taken. 
Default: 'elementwise_mean' + + Example:: + + >>> log_probs = torch.randn(50, 16, 20).log_softmax(2).detach().requires_grad_() + >>> targets = torch.randint(1, 21, (16, 30), dtype=torch.long) + >>> input_lengths = torch.full((16,), 50, dtype=torch.long) + >>> target_lengths = torch.randint(10,30,(16,), dtype=torch.long) + >>> loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths) + >>> loss.backward() + """ + return torch.ctc_loss(log_probs, targets, input_lengths, target_lengths, blank, _Reduction.get_enum(reduction)) + def nll_loss(input, target, weight=None, size_average=None, ignore_index=-100, reduce=None, reduction='elementwise_mean'): @@ -1671,7 +1725,7 @@ def _pointwise_loss(lambd, lambd_optimized, input, target, reduction='elementwis return d return torch.mean(d) if reduction == 'elementwise_mean' else torch.sum(d) else: - return lambd_optimized(input, target, reduction) + return lambd_optimized(input, target, _Reduction.get_enum(reduction)) def smooth_l1_loss(input, target, size_average=None, reduce=None, reduction='elementwise_mean'): @@ -1695,9 +1749,7 @@ def l1_loss(input, target, size_average=None, reduce=None, reduction='elementwis See :class:`~torch.nn.L1Loss` for details. """ if size_average is not None or reduce is not None: - reduction = _Reduction.legacy_get_enum(size_average, reduce) - else: - reduction = _Reduction.get_enum(reduction) + reduction = _Reduction.legacy_get_string(size_average, reduce) return _pointwise_loss(lambda a, b: torch.abs(a - b), torch._C._nn.l1_loss, input, target, reduction) @@ -1710,9 +1762,7 @@ def mse_loss(input, target, size_average=None, reduce=None, reduction='elementwi See :class:`~torch.nn.MSELoss` for details. """ if size_average is not None or reduce is not None: - reduction = _Reduction.legacy_get_enum(size_average, reduce) - else: - reduction = _Reduction.get_enum(reduction) + reduction = _Reduction.legacy_get_string(size_average, reduce) return _pointwise_loss(lambda a, b: (a - b) ** 2, torch._C._nn.mse_loss, input, target, reduction) diff --git a/torch/nn/modules/__init__.py b/torch/nn/modules/__init__.py index 4d98f482768a63..6c66f8d43f005f 100644 --- a/torch/nn/modules/__init__.py +++ b/torch/nn/modules/__init__.py @@ -3,10 +3,10 @@ from .conv import Conv1d, Conv2d, Conv3d, \ ConvTranspose1d, ConvTranspose2d, ConvTranspose3d from .activation import Threshold, ReLU, Hardtanh, ReLU6, Sigmoid, Tanh, \ - Softmax, Softmax2d, LogSoftmax, ELU, SELU, Hardshrink, LeakyReLU, LogSigmoid, \ + Softmax, Softmax2d, LogSoftmax, ELU, SELU, CELU, Hardshrink, LeakyReLU, LogSigmoid, \ Softplus, Softshrink, PReLU, Softsign, Softmin, Tanhshrink, RReLU, GLU from .loss import L1Loss, NLLLoss, KLDivLoss, MSELoss, BCELoss, BCEWithLogitsLoss, NLLLoss2d, \ - CosineEmbeddingLoss, HingeEmbeddingLoss, MarginRankingLoss, \ + CosineEmbeddingLoss, CTCLoss, HingeEmbeddingLoss, MarginRankingLoss, \ MultiLabelMarginLoss, MultiLabelSoftMarginLoss, MultiMarginLoss, \ SmoothL1Loss, SoftMarginLoss, CrossEntropyLoss, TripletMarginLoss, PoissonNLLLoss from .container import Container, Sequential, ModuleList, ModuleDict, ParameterList, ParameterDict @@ -31,10 +31,10 @@ __all__ = [ 'Module', 'Linear', 'Conv1d', 'Conv2d', 'Conv3d', 'ConvTranspose1d', 'ConvTranspose2d', 'ConvTranspose3d', 'Threshold', 'ReLU', 'Hardtanh', 'ReLU6', - 'Sigmoid', 'Tanh', 'Softmax', 'Softmax2d', 'LogSoftmax', 'ELU', 'SELU', 'GLU', 'Hardshrink', + 'Sigmoid', 'Tanh', 'Softmax', 'Softmax2d', 'LogSoftmax', 'ELU', 'SELU', 'CELU', 'GLU', 'Hardshrink', 'LeakyReLU', 'LogSigmoid', 
'Softplus', 'Softshrink', 'PReLU', 'Softsign', 'Softmin', 'Tanhshrink', 'RReLU', 'L1Loss', 'NLLLoss', 'KLDivLoss', 'MSELoss', 'BCELoss', 'BCEWithLogitsLoss', - 'NLLLoss2d', 'PoissonNLLLoss', 'CosineEmbeddingLoss', 'HingeEmbeddingLoss', 'MarginRankingLoss', + 'NLLLoss2d', 'PoissonNLLLoss', 'CosineEmbeddingLoss', 'CTCLoss', 'HingeEmbeddingLoss', 'MarginRankingLoss', 'MultiLabelMarginLoss', 'MultiLabelSoftMarginLoss', 'MultiMarginLoss', 'SmoothL1Loss', 'SoftMarginLoss', 'CrossEntropyLoss', 'Container', 'Sequential', 'ModuleList', 'ModuleDict', 'ParameterList', 'ParameterDict', 'AvgPool1d', 'AvgPool2d', 'AvgPool3d', 'MaxPool1d', 'MaxPool2d', diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py index d372a2cae21d2c..51cfab79404145 100644 --- a/torch/nn/modules/activation.py +++ b/torch/nn/modules/activation.py @@ -118,6 +118,7 @@ class RReLU(Module): .. _`Empirical Evaluation of Rectified Activations in Convolutional Network`: https://arxiv.org/abs/1505.00853 """ + def __init__(self, lower=1. / 8, upper=1. / 3, inplace=False): super(RReLU, self).__init__() self.lower = lower @@ -299,6 +300,46 @@ def extra_repr(self): return 'alpha={}{}'.format(self.alpha, inplace_str) +class CELU(Module): + r"""Applies element-wise, + :math:`\text{CELU}(x) = \max(0,x) + \min(0, \alpha * (\exp(x/\alpha) - 1))` + + More details can be found in the paper `Continuously Differentiable Exponential Linear Units`_ . + + Args: + alpha: the :math:`\alpha` value for the CELU formulation. Default: 1.0 + inplace: can optionally do the operation in-place. Default: ``False`` + + Shape: + - Input: :math:`(N, *)` where `*` means, any number of additional + dimensions + - Output: :math:`(N, *)`, same shape as the input + + .. image:: scripts/activation_images/CELU.png + + Examples:: + + >>> m = nn.CELU() + >>> input = torch.randn(2) + >>> output = m(input) + + .. _`Continuously Differentiable Exponential Linear Units`: + https://arxiv.org/abs/1704.07483 + """ + + def __init__(self, alpha=1., inplace=False): + super(CELU, self).__init__() + self.alpha = alpha + self.inplace = inplace + + def forward(self, input): + return F.celu(input, self.alpha, self.inplace) + + def extra_repr(self): + inplace_str = ', inplace' if self.inplace else '' + return 'alpha={}{}'.format(self.alpha, inplace_str) + + class SELU(Module): r"""Applies element-wise, :math:`\text{SELU}(x) = \text{scale} * (\max(0,x) + \min(0, \alpha * (\exp(x) - 1)))`, @@ -668,6 +709,7 @@ class Softmin(Module): >>> input = torch.randn(2, 3) >>> output = m(input) """ + def __init__(self, dim=None): super(Softmin, self).__init__() self.dim = dim diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py index 489e8998843f98..ec7d60d8125152 100644 --- a/torch/nn/modules/loss.py +++ b/torch/nn/modules/loss.py @@ -1123,6 +1123,61 @@ def forward(self, anchor, positive, negative): return F.triplet_margin_loss(anchor, positive, negative, margin=self.margin, p=self.p, eps=self.eps, swap=self.swap, reduction=self.reduction) + +class CTCLoss(_Loss): + r"""The Connectionist Temporal Classification loss. + + Args: + blank (int, optional): blank label. Default :math:`0`. + reduction (string, optional): Specifies the reduction to apply to the output: + 'none' | 'elementwise_mean' | 'sum'. 'none': no reduction will be applied, + 'elementwise_mean': the output losses will be divided by the target lengths and + then the mean over the batch is taken. 
Default: 'elementwise_mean' + + Inputs: + log_probs: :math:`(T, N, C)` where `C = number of characters in alphabet including blank`, + `T = input length`, and `N = batch size`. + The logarithmized probabilities of the outputs + (e.g. obtained with :func:`torch.nn.functional.log_softmax`). + targets: :math:`(N, S)` or `(sum(target_lenghts))`. + Targets (cannot be blank). In the second form, the targets are assumed to be concatenated. + input_lengths: :math:`(N)`. + Lengths of the inputs (must each be :math:`\leq T`) + target_lengths: :math:`(N)`. + Lengths of the targets + + + Example:: + + >>> ctc_loss = nn.CTCLoss() + >>> log_probs = torch.randn(50, 16, 20).log_softmax(2).detach().requires_grad_() + >>> targets = torch.randint(1, 21, (16, 30), dtype=torch.long) + >>> input_lengths = torch.full((16,), 50, dtype=torch.long) + >>> target_lengths = torch.randint(10,30,(16,), dtype=torch.long) + >>> loss = ctc_loss(log_probs, targets, input_lengths, target_lengths) + >>> loss.backward() + + Reference: + A. Graves et al.: Connectionist Temporal Classification: + Labelling Unsegmented Sequence Data with Recurrent Neural Networks: + https://www.cs.toronto.edu/~graves/icml_2006.pdf + + .. Note:: + In order to use CuDNN, the following must be satisfied: :attr:`targets` must be + in concatenated format, all :attr:`input_lengths` must be `T`. :math:`blank=0`, + :attr:`target_lengths` :math:`\leq 256`, the integer arguments must be of + :class:`torch.IntTensor`. + + The regular implementation uses the (more common in PyTorch) `torch.long` dtype. + """ + + def __init__(self, blank=0, reduction='elementwise_mean'): + super(CTCLoss, self).__init__(reduction=reduction) + self.blank = blank + + def forward(self, log_probs, targets, input_lengths, target_lengths): + return F.ctc_loss(log_probs, targets, input_lengths, target_lengths, self.blank, self.reduction) + # TODO: L1HingeEmbeddingCriterion # TODO: MSECriterion weight # TODO: ClassSimplexCriterion diff --git a/torch/nn/parallel/distributed_c10d.py b/torch/nn/parallel/distributed_c10d.py index c2b32cb97b6b01..424670ac76fc14 100644 --- a/torch/nn/parallel/distributed_c10d.py +++ b/torch/nn/parallel/distributed_c10d.py @@ -242,11 +242,7 @@ def train(self, mode=True): module.train(mode) def _dist_broadcast_coalesced(self, tensors, buffer_size): - for tensors in _take_tensors(tensors, buffer_size): - flat_tensors = _flatten_dense_tensors(tensors) - c10d.broadcast(flat_tensors, 0, self.process_group).wait() - for tensor, synced in zip(tensors, _unflatten_dense_tensors(flat_tensors, tensors)): - tensor.copy_(synced) + c10d._dist_broadcast_coalesced(tensors, buffer_size, self.process_group) def _sync_params(self): if len(self.device_ids) > 1: diff --git a/torch/nn/utils/convert_parameters.py b/torch/nn/utils/convert_parameters.py index 7f0dd1666dad9c..36a7eb207bcc65 100644 --- a/torch/nn/utils/convert_parameters.py +++ b/torch/nn/utils/convert_parameters.py @@ -45,9 +45,9 @@ def vector_to_parameters(vec, parameters): param_device = _check_param_device(param, param_device) # The length of the parameter - num_param = torch.prod(torch.LongTensor(list(param.size()))) + num_param = param.numel() # Slice the vector, reshape it, and replace the old data of the parameter - param.data = vec[pointer:pointer + num_param].view(param.size()).data + param.data = vec[pointer:pointer + num_param].view_as(param).data # Increment the pointer pointer += num_param diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index 3ca44f35c4eff3..3262ca282b2c5d 100644 --- 
a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -70,6 +70,12 @@ def _get_const(value, desc, arg_name): return _parse_arg(value, desc) +def _unpack_list(list_value): + list_node = list_value.node() + assert list_node.kind() == "prim::ListConstruct" + return list_node.inputs() + + def parse_args(*arg_descriptors): def decorator(fn): def wrapper(g, *args): @@ -215,13 +221,18 @@ def reciprocal(g, self): return g.op("Div", _if_scalar_type_as(g, torch.ones(1), self), self) -# This syntax is Python 2 portable -def cat(g, *args): - dim = _get_const(args[-1], 'i', 'dim') - tensors = args[:-1] +@parse_args('v', 'i') +def cat(g, tensor_list, dim): + tensors = _unpack_list(tensor_list) return g.op("Concat", *tensors, axis_i=dim) +@parse_args('v', 'i') +def stack(g, tensor_list, dim): + unsqueezed = [g.op("Unsqueeze", t, axes_i=[dim]) for t in _unpack_list(tensor_list)] + return g.op("Concat", *unsqueezed, axis_i=dim) + + def mm(g, self, other): # Create a dummy C tensor. Only needed for API purposes, the value is # since beta = 0 @@ -349,11 +360,6 @@ def view(g, self, size): return g.op("Reshape", self, shape) -def stack(g, *args): - unsqueezed = [g.op("Unsqueeze", t, axes_i=[dim]) for t in args[:-1]] + [args[-1]] - return concat(g, *unsqueezed) - - @parse_args('v', 'i', 'i') def split(g, self, split_size, dim): size = self.type().sizes()[dim] @@ -555,9 +561,10 @@ def replication_pad(g, input, padding): @parse_args('v', 'is') def upsample_nearest2d(g, input, output_size): + height_scale = float(output_size[-2]) / input.type().sizes()[-2] + width_scale = float(output_size[-1]) / input.type().sizes()[-1] return g.op("Upsample", input, - height_scale_f=float(output_size[-2]) / input.type().sizes()[-2], - width_scale_f=float(output_size[-1]) / input.type().sizes()[-1], + scales_f=[1., 1., height_scale, width_scale], mode_s="nearest") @@ -565,10 +572,11 @@ def upsample_nearest2d(g, input, output_size): def upsample_bilinear2d(g, input, output_size, align_corners): if align_corners: return _unimplemented("upsample_bilinear2d", "align_corners == True") - w_scale = float(output_size[-1]) / input.type().sizes()[-1] - h_scale = float(output_size[-2]) / input.type().sizes()[-2] - return g.op("Upsample", input, width_scale_f=w_scale, - height_scale_f=h_scale, mode_s="bilinear") + height_scale = float(output_size[-2]) / input.type().sizes()[-2] + width_scale = float(output_size[-1]) / input.type().sizes()[-1] + return g.op("Upsample", input, + scales_f=[1., 1., height_scale, width_scale], + mode_s="bilinear") def gt(g, input, other): @@ -659,10 +667,12 @@ def unfold(g, input, dimension, size, step): return g.op("ATen", input, operator_s="unfold", dimension_i=dimension, size_i=size, step_i=step) -@parse_args('v', 't', 't') -def elu(g, input, alpha, scale): +@parse_args('v', 't', 't', 't') +def elu(g, input, alpha, scale, input_scale): if scale and scale != 1.: return _unimplemented("scale", "does not support scale in Elu") + if input_scale and input_scale != 1.: + return _unimplemented("input_scale", "does not support input_scale in Elu") # See Note [Export inplace] return g.op("Elu", input, alpha_f=_scalar(alpha)) @@ -676,8 +686,10 @@ def index_select(g, self, dim, index): return g.op("Gather", self, index, axis_i=dim) -def index_put(g, *inputs): - return g.op("ATen", *inputs, operator_s='index_put') +def index_put(g, self, indices_list_value, values): + indices_list = list(_unpack_list(indices_list_value)) + args = [self] + indices_list + [values] + return g.op("ATen", *args, operator_s='index_put') def 
type_as(g, self, other): @@ -868,14 +880,17 @@ def topk(g, self, k, dim, largest, sorted, out=None): return g.op("TopK", self, k_i=k, axis_i=dim, outputs=2) -@parse_args('v', 'is') def repeat(g, self, repeats): - if self.isTensor(): + if not _is_value(repeats): + repeats = g.op("Constant", value_t=torch.LongTensor(repeats)) + const_repeats = _maybe_get_const(repeats, 'is') + + if self.isTensor() and not _is_value(const_repeats): sizes = self.type().sizes() - diff_dims = len(repeats) - len(sizes) + diff_dims = len(const_repeats) - len(sizes) if diff_dims > 0: self = view(g, self, [1] * diff_dims + sizes) - return g.op("Tile", self, g.op("Constant", value_t=torch.LongTensor(repeats))) + return g.op("Tile", self, repeats) def instance_norm(g, input, **kwargs): diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 4f9299d258ea3e..b770b900c4edd3 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -480,8 +480,14 @@ def _run_symbolic_function(g, n, inputs, env, operator_export_type=OperatorExpor raise RuntimeError("Unsupported prim::Constant kind: `{}`. Send a bug report.".format( n.kindOf("value"))) elif op_name == "ListConstruct": - unsqueezed = [g.op("Unsqueeze", input, axes_i=[0]) for input in inputs] - return g.op("Concat", *unsqueezed, axis_i=0) + t = n.output().type() + # Tensor lists are used mostly for inputs to cat/stack. They need to be handled + # in those symbolics, and should become dead afterwards. + if t == torch._C.ListType.ofTensors(): + return None + elif t == torch._C.ListType.ofInts(): + unsqueezed = [g.op("Unsqueeze", input, axes_i=[0]) for input in inputs] + return g.op("Concat", *unsqueezed, axis_i=0) elif op_name == "Undefined": # Undefined is not an ONNX operator; keep it as prim::Undefined # and let the exporter handle finally eliminating these diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py index ad7f780719ccd3..96cfaff8684cf0 100644 --- a/torch/optim/lr_scheduler.py +++ b/torch/optim/lr_scheduler.py @@ -1,3 +1,4 @@ +import types import math import torch from torch._six import inf @@ -86,6 +87,37 @@ def __init__(self, optimizer, lr_lambda, last_epoch=-1): self.last_epoch = last_epoch super(LambdaLR, self).__init__(optimizer, last_epoch) + def state_dict(self): + """Returns the state of the scheduler as a :class:`dict`. + + It contains an entry for every variable in self.__dict__ which + is not the optimizer. + The learning rate lambda functions will only be saved if they are callable objects + and not if they are functions or lambdas. + """ + state_dict = {key: value for key, value in self.__dict__.items() if key not in ('optimizer', 'lr_lambdas')} + state_dict['lr_lambdas'] = [None] * len(self.lr_lambdas) + + for idx, fn in enumerate(self.lr_lambdas): + if not isinstance(fn, types.FunctionType): + state_dict['lr_lambdas'][idx] = fn.__dict__.copy() + + return state_dict + + def load_state_dict(self, state_dict): + """Loads the schedulers state. + + Arguments: + state_dict (dict): scheduler state. Should be an object returned + from a call to :meth:`state_dict`. 
+ """ + lr_lambdas = state_dict.pop('lr_lambdas') + self.__dict__.update(state_dict) + + for idx, fn in enumerate(lr_lambdas): + if fn is not None: + self.lr_lambdas[idx].__dict__.update(fn) + def get_lr(self): return [base_lr * lmbda(self.last_epoch) for lmbda, base_lr in zip(self.lr_lambdas, self.base_lrs)] diff --git a/torch/tensor.py b/torch/tensor.py index 6b587fcf903586..9784fd59c9d2fb 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -384,6 +384,8 @@ def __dir__(self): return sorted(keys) # Numpy array interface, to support `numpy.asarray(tensor) -> ndarray` + __array_priority__ = 1000 # prefer Tensor ops over numpy ones + def __array__(self, dtype=None): if dtype is None: return self.cpu().numpy()