diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 00000000000000..4077f91d3a683f --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,7 @@ +version: 2 +jobs: + build: + docker: + - image: circleci/python:3.7-node-browsers + steps: + - run: echo "hello world" diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index 345e89ccf879fb..30108031f72308 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -155,6 +155,9 @@ if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then export LANG=C.UTF-8 export LC_ALL=C.UTF-8 export HCC_AMDGPU_TARGET=gfx900 + + ########## HIPIFY Caffe2 operators + ${PYTHON} "${ROOT_DIR}/tools/amd_build/build_caffe2_amd.py" fi # Try to include Redis support for Linux builds @@ -195,6 +198,7 @@ else fi + ############################################################################### # Configure and make ############################################################################### diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index e4e0e9fc1d66f0..1e05bbdcb9b600 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -104,5 +104,5 @@ if [[ "$BUILD_TEST_LIBTORCH" == "1" ]]; then echo "Building libtorch" # NB: Install outside of source directory (at the same level as the root # pytorch folder) so that it doesn't get cleaned away prior to docker push. - WERROR=1 VERBOSE=1 tools/cpp_build/build_all.sh "$PWD/../cpp-build" + WERROR=1 VERBOSE=1 tools/cpp_build/build_caffe2.sh "$PWD/../cpp-build" fi diff --git a/.jenkins/pytorch/disabled-configs.txt b/.jenkins/pytorch/disabled-configs.txt index cdd51d3fb54a56..c7041697026085 100644 --- a/.jenkins/pytorch/disabled-configs.txt +++ b/.jenkins/pytorch/disabled-configs.txt @@ -3,3 +3,5 @@ # fail. You can use this to temporarily reserve a test name to # turn on CI side before PyTorch repository supports it. This # file has the same format as .jenkins/enabled-configs.txt + +py2-clang3.8-rocmnightly-ubuntu16.04-test diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index 7dc760c06683f4..92ef7ad191adb0 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -57,7 +57,7 @@ test_cpp_api() { CPP_BUILD="$PWD/../cpp-build" rm -rf $CPP_BUILD mkdir -p $CPP_BUILD - WERROR=1 VERBOSE=1 tools/cpp_build/build_all.sh "$CPP_BUILD" + WERROR=1 VERBOSE=1 tools/cpp_build/build_caffe2.sh "$CPP_BUILD" python tools/download_mnist.py --quiet -d test/cpp/api/mnist @@ -65,7 +65,7 @@ test_cpp_api() { # without these paths being set export DYLD_LIBRARY_PATH="$DYLD_LIBRARY_PATH:$PWD/miniconda3/lib" export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/miniconda3/lib" - "$CPP_BUILD"/libtorch/bin/test_api + "$CPP_BUILD"/caffe2/bin/test_api } if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index bc2762860dd2bd..7e5b98ee628cd1 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -9,11 +9,6 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh" echo "Testing pytorch" -if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then - echo "Skipping ROCm tests for now" - exit 0 -fi - # JIT C++ extensions require ninja. git clone https://github.com/ninja-build/ninja --quiet pushd ninja @@ -49,13 +44,10 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then (cd test && ! 
get_exit_code python -c "import torch; torch._C._crash_if_aten_asan(3)") fi -export ATEN_DISABLE_AVX= -export ATEN_DISABLE_AVX2= if [[ "${JOB_BASE_NAME}" == *-NO_AVX-* ]]; then - export ATEN_DISABLE_AVX=1 -fi -if [[ "${JOB_BASE_NAME}" == *-NO_AVX2-* ]]; then - export ATEN_DISABLE_AVX2=1 + export ATEN_CPU_CAPABILITY=default +elif [[ "${JOB_BASE_NAME}" == *-NO_AVX2-* ]]; then + export ATEN_CPU_CAPABILITY=avx fi test_python_nn() { @@ -104,12 +96,12 @@ test_libtorch() { echo "Testing libtorch" CPP_BUILD="$PWD/../cpp-build" if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then - "$CPP_BUILD"/libtorch/bin/test_jit + "$CPP_BUILD"/caffe2/bin/test_jit else - "$CPP_BUILD"/libtorch/bin/test_jit "[cpu]" + "$CPP_BUILD"/caffe2/bin/test_jit "[cpu]" fi python tools/download_mnist.py --quiet -d test/cpp/api/mnist - OMP_NUM_THREADS=2 "$CPP_BUILD"/libtorch/bin/test_api + OMP_NUM_THREADS=2 "$CPP_BUILD"/caffe2/bin/test_api fi } diff --git a/CMakeLists.txt b/CMakeLists.txt index 1cb5d13bc10f31..9e302869a8a8ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,6 +53,7 @@ endif() # Note to developers: if you add an option below, make sure you also add it to # cmake/Summary.cmake so that the summary prints out the option values. include(CMakeDependentOption) +option(BUILD_TORCH "Build Torch" OFF) option(BUILD_CAFFE2 "Build Caffe2" ON) option(BUILD_ATEN "Build ATen" OFF) option(BUILD_BINARY "Build C++ binaries" ON) @@ -214,6 +215,7 @@ if(NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-variable") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-function") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-overflow") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-aliasing") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations") # These flags are not available in GCC-4.8.5. Set only when using clang. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 805cfd72573b76..08ff783dea4657 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -72,6 +72,9 @@ For example: You do not need to repeatedly install after modifying python files. +In case you want to reinstall, make sure that you uninstall pytorch first by running `pip uninstall torch` +and `python setup.py clean`. Then you can install in `build develop` mode again. + ## Unit testing PyTorch's testing is located under `test/`. Run the entire test suite with @@ -146,9 +149,7 @@ working on: - Working on `torch/lib` and want to run your changes / rerun cmake? Run `python setup.py build_deps`. Note that this will rerun cmake for - every subdirectory in TH; if you are only working on one project, - consider editing `torch/lib/build_all.sh` and commenting out the - `build` lines of libraries you are not working on. + every subdirectory in TH. On the initial build, you can also speed things up with the environment variables `DEBUG` and `NO_CUDA`. 
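The reinstall advice above amounts to three commands. A minimal sketch of that workflow, assuming the `python setup.py build develop` install described earlier in CONTRIBUTING.md:

```sh
# Remove the installed package and any stale build artifacts first.
pip uninstall torch
python setup.py clean

# Then reinstall in build develop mode (as documented in CONTRIBUTING.md).
python setup.py build develop
```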
diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index eb7cf48e316a24..462a12b086d2d0 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -80,14 +80,20 @@ add_subdirectory(src/TH) set(TH_CPU_INCLUDE # dense ${CMAKE_CURRENT_SOURCE_DIR}/src/TH - ${CMAKE_CURRENT_SOURCE_DIR}/src/THC ${CMAKE_CURRENT_BINARY_DIR}/src/TH - ${CMAKE_CURRENT_BINARY_DIR}/src/THC - ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src ${CMAKE_BINARY_DIR}/aten/src) list(APPEND ATen_CPU_INCLUDE ${TH_CPU_INCLUDE}) + +if(USE_CUDA OR USE_ROCM) + set(TH_CUDA_INCLUDE + # dense + ${CMAKE_CURRENT_SOURCE_DIR}/src/THC + ${CMAKE_CURRENT_BINARY_DIR}/src/THC) + list(APPEND ATen_CUDA_INCLUDE ${TH_CUDA_INCLUDE}) +endif() + add_subdirectory(src/THNN) # Find the HIP package, set the HIP paths, load the HIP CMake. diff --git a/aten/src/ATen/ATen.h b/aten/src/ATen/ATen.h index a7084d474d8ab2..2ae326a68ecebc 100644 --- a/aten/src/ATen/ATen.h +++ b/aten/src/ATen/ATen.h @@ -21,3 +21,4 @@ #include "ATen/TensorOptions.h" #include "ATen/Layout.h" #include "ATen/OptionsGuard.h" +#include "ATen/CUDAGuard.h" diff --git a/aten/src/ATen/Allocator.h b/aten/src/ATen/Allocator.h index 867ae4cb5f54bb..c1c78102a0fef8 100644 --- a/aten/src/ATen/Allocator.h +++ b/aten/src/ATen/Allocator.h @@ -30,6 +30,9 @@ class DataPtr { DataPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter, Device device) : ptr_(data, ctx, ctx_deleter), device_(device) {} void* operator->() const { return ptr_.get(); } + void clear() { + ptr_.clear(); + } void* get() const { return ptr_.get(); } void* get_context() const { return ptr_.get_context(); } void* release_context() { return ptr_.release_context(); } diff --git a/aten/src/ATen/CUDAGuard.h b/aten/src/ATen/CUDAGuard.h new file mode 100644 index 00000000000000..8027084caa7c30 --- /dev/null +++ b/aten/src/ATen/CUDAGuard.h @@ -0,0 +1,110 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace at { + +/// A variant of `DeviceGuard` that augments it with an understanding of CUDA +/// streams. This guard can not only set and reset the current CUDA device, but +/// also set and reset the current CUDA stream. It is important to note that +/// because a CUDA stream is intrinsically associated with the CUDA device to +/// which it is bound, setting the CUDA stream *also* sets the current CUDA +/// device to that of the stream. +struct CUDAGuard { + /// Default constructor, does nothing and causes no change in the current + /// stream or device until `set_stream` or `set_device` is called. + CUDAGuard() = default; + + /// Sets the CUDA stream and its associated device as the current one (calls + /// `set_stream`). + explicit CUDAGuard(const CUDAStream& stream) { + set_stream(stream); + } + + /// Calls `set_device` with the given index. + explicit CUDAGuard(int32_t device) { + set_device(device); + } + + CUDAGuard(const CUDAGuard&) = delete; + CUDAGuard& operator=(const CUDAGuard&) = delete; + + /// Move-constructs this `CUDAGuard` from another `CUDAGuard`. The + /// moved-from `CUDAGuard` is modified such that its destruction has no + /// effect (does not reset the stream or device). + CUDAGuard(CUDAGuard&& other) noexcept = default; + + /// Move-assigns this `CUDAGuard` from another `CUDAGuard`. The + /// moved-from `CUDAGuard` is modified such that its destruction has no + /// effect (does not reset the stream or device). 
+ CUDAGuard& operator=(CUDAGuard&& other) { + device_guard_ = std::move(other.device_guard_); + original_streams_ = std::move(other.original_streams_); + other.original_streams_.clear(); + return *this; + } + + /// Resets the CUDA stream on each device to the one that was active upon + /// construction. + ~CUDAGuard() { + if (!original_streams_.empty()) { + for (size_t device = 0; device < original_streams_.size(); ++device) { + globalContext().uncheckedSetCurrentCUDAStreamOnDevice( + device, original_streams_[device]); + } + } + } + + /// Sets the current CUDA device to the device associated with the given + /// stream, and then sets the current stream on that device to the one given. + void set_stream(const CUDAStream& stream) { + device_guard_.set_index(stream.device()); + // If we haven't stored the current stream yet, store it now. + if (original_streams_.empty()) { + const size_t device_count = globalContext().getNumGPUs(); + original_streams_.reserve(device_count); + for (size_t device = 0; device < device_count; ++device) { + original_streams_.push_back( + globalContext().getCurrentCUDAStreamOnDevice(device)); + } + } + globalContext().setCurrentCUDAStreamOnDevice( + device_guard_.last_index(), stream); + } + + /// Sets the CUDA device to the given one. + void set_device(int32_t device) { + device_guard_.set_index(device); + } + + /// Returns the CUDA streams that were active in the first call to + /// `set_stream`. If there was no such call, the returned container is + /// empty. + ArrayRef original_streams() const noexcept { + return original_streams_; + } + + /// Returns the device that was set upon construction of the guard. + int32_t original_device() const noexcept { + return device_guard_.original_index(); + } + + /// Returns the last device that was set via `set_device`, if any. + int32_t last_device() const noexcept { + return device_guard_.last_index(); + } + + private: + /// The guard for the current device. + DeviceGuard device_guard_; + /// The original streams that were active on all devices. + std::vector original_streams_; +}; + +} // namespace at diff --git a/aten/src/ATen/CUDAStream.cpp b/aten/src/ATen/CUDAStream.cpp index ad9d51cbf46f2b..b8b8d588ffbfc4 100644 --- a/aten/src/ATen/CUDAStream.cpp +++ b/aten/src/ATen/CUDAStream.cpp @@ -1,10 +1,10 @@ -#include "ATen/CUDAStream.h" + #include "ATen/CUDAStream.h" #include "ATen/Error.h" #include "ATen/detail/CUDAHooksInterface.h" #include -// Internal implementation is entirely hidden +// Internal implementation is entirely hidden struct CUDAStreamInternals { bool is_destructible; std::atomic refcount; @@ -29,7 +29,7 @@ namespace detail { // Creates a(n indestructible) default stream for each device // Note: the default stream on each device is signified by a zero // value for the pointer, and so is not actually created as usual. - // In particular, we don't need to switch devices when creating the + // In particular, we don't need to switch devices when creating the // streams. 
static void initDefaultCUDAStreams() { num_gpus = getCUDAHooks().getNumGPUs(); @@ -46,8 +46,8 @@ namespace detail { static void initCUDAStreamsOnce() { // Inits default streams (once, globally) std::call_once(init_flag, initDefaultCUDAStreams); - - // Inits current streams (thread local) to default streams + + // Inits current streams (thread local) to default streams if (current_streams) return; current_streams = (CUDAStreamInternals**) malloc(num_gpus * sizeof(CUDAStreamInternals*)); for (auto i = decltype(num_gpus){0}; i < num_gpus; ++i) { @@ -68,7 +68,7 @@ namespace detail { // Helper to verify the GPU index is valid static inline void check_gpu(int64_t device) { - AT_CHECK(device >= 0 && device < num_gpus); + AT_ASSERT(device >= 0 && device < num_gpus); } CUDAStreamInternals* CUDAStream_getDefaultStreamOnDevice(int64_t device) { @@ -91,14 +91,14 @@ namespace detail { } // Note: despite not being "unsafe," is using these methods in a multithreaded - // environment then the caller must be sure that streams are valid + // environment then the caller must be sure that streams are valid // when they're requested. These methods will throw an error if an // invalid stream is requested. CUDAStreamInternals* CUDAStream_getAndRetainCurrentStreamOnDevice(int64_t device) { initCUDAStreamsOnce(); check_gpu(device); auto cur = current_streams[device]; - AT_CHECK(CUDAStream_retain(cur)); + AT_ASSERT(CUDAStream_retain(cur)); return cur; } CUDAStreamInternals* CUDAStream_getAndRetainCurrentStream() { @@ -120,44 +120,58 @@ namespace detail { void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* ptr) { initCUDAStreamsOnce(); check_gpu(device); - AT_CHECK(ptr); - AT_CHECK(ptr->device == device); - AT_CHECK(CUDAStream_retain(ptr)); + AT_ASSERT(ptr); + AT_ASSERT(ptr->device == device); + AT_ASSERT(CUDAStream_retain(ptr)); CUDAStream_free(current_streams[device]); current_streams[device] = ptr; } + + void CUDAStream_uncheckedSetStreamOnDevice(int64_t device, CUDAStreamInternals* ptr) { + initCUDAStreamsOnce(); + CUDAStream_uncheckedFree(current_streams[device]); + current_streams[device] = ptr; + } + void CUDAStream_setStream(CUDAStreamInternals* ptr) { CUDAStream_setStreamOnDevice(current_device(), ptr); } // Getters cudaStream_t CUDAStream_stream(CUDAStreamInternals* ptr) { - AT_CHECK(ptr); + AT_ASSERT(ptr); return ptr->stream; } int64_t CUDAStream_device(CUDAStreamInternals* ptr) { - AT_CHECK(ptr); + AT_ASSERT(ptr); return ptr->device; } // Memory management // Note: only destructible (non-default) streams are ref counted bool CUDAStream_retain(CUDAStreamInternals* ptr) { - AT_CHECK(ptr); + AT_ASSERT(ptr); if (ptr->is_destructible) return(++ptr->refcount > 1); return true; } void CUDAStream_free(CUDAStreamInternals*& ptr) { if (ptr && ptr->stream && ptr->is_destructible && --ptr->refcount <= 0) { - AT_CHECK(ptr->refcount == 0); + AT_ASSERT(ptr->refcount == 0); DynamicCUDAInterface::cuda_stream_destroy(ptr->stream); free(ptr); ptr = nullptr; } } + void CUDAStream_uncheckedFree(CUDAStreamInternals*& ptr) { + if (ptr && ptr->stream && ptr->is_destructible && --ptr->refcount <= 0) { + DynamicCUDAInterface::unchecked_cuda_stream_destroy(ptr->stream); + free(ptr); + ptr = nullptr; + } + } } // namespace detail @@ -167,17 +181,17 @@ namespace detail { // Copy constructor CUDAStream::CUDAStream(const CUDAStream& other) { - AT_CHECK(other.internals_); - AT_CHECK(detail::CUDAStream_retain(other.internals_)); + AT_ASSERT(other.internals_); + AT_ASSERT(detail::CUDAStream_retain(other.internals_)); 
internals_ = other.internals_; } // Move constructor CUDAStream::CUDAStream(CUDAStream&& other) { - AT_CHECK(other.internals_); + AT_ASSERT(other.internals_); std::swap(internals_, other.internals_); } - + } // namespace at diff --git a/aten/src/ATen/CUDAStream.h b/aten/src/ATen/CUDAStream.h index 34a1295b712da8..c5fc8111e13bef 100644 --- a/aten/src/ATen/CUDAStream.h +++ b/aten/src/ATen/CUDAStream.h @@ -5,9 +5,9 @@ /* * A CUDA stream interface with no CUDA build dependency. -* +* * Includes the CUDAStream RAII class and a pointer-based stream API. -* +* * The ATen Context interface should be preferred when working with streams. */ @@ -39,6 +39,9 @@ CUDAStreamInternals* CUDAStream_getCurrentStreamOnDeviceUnsafe(int64_t device); CUDAStreamInternals* CUDAStream_getCurrentStreamUnsafe(); void CUDAStream_setStreamOnDevice(int64_t device, CUDAStreamInternals* internals); +void CUDAStream_uncheckedSetStreamOnDevice( + int64_t device, + CUDAStreamInternals* internals); void CUDAStream_setStream(CUDAStreamInternals* internals); cudaStream_t CUDAStream_stream(CUDAStreamInternals*); @@ -46,6 +49,7 @@ int64_t CUDAStream_device(CUDAStreamInternals*); bool CUDAStream_retain(CUDAStreamInternals*); void CUDAStream_free(CUDAStreamInternals*&); +void CUDAStream_uncheckedFree(CUDAStreamInternals*&); } // namespace detail @@ -58,23 +62,33 @@ struct CUDAStream { // Constructors CUDAStream() = default; - CUDAStream(CUDAStreamInternals* internals) : internals_{internals} { } - + /* implicit */ CUDAStream(CUDAStreamInternals* internals, bool retain = false) + : internals_{internals} { + if (retain) { + detail::CUDAStream_retain(internals_); + } + } + // Destructor - ~CUDAStream() { detail::CUDAStream_free(internals_); } + ~CUDAStream() { detail::CUDAStream_uncheckedFree(internals_); } // Copy constructor CUDAStream(const CUDAStream& other); // Move constructor - CUDAStream(CUDAStream&& other); + CUDAStream(CUDAStream&& other); // Assignment operator - CUDAStream& operator=(CUDAStream other) { + CUDAStream& operator=(CUDAStream other) noexcept { std::swap(internals_, other.internals_); return *this; } + // Returns true if the CUDAStream is not null. 
+ explicit operator bool() const noexcept { + return internals_ != nullptr; + } + // Implicit conversion to cudaStream_t operator cudaStream_t() const { return detail::CUDAStream_stream(internals_); } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index accb57bbda67b4..9a9125ccedfe0a 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -114,6 +114,12 @@ class AT_API Context { return detail::CUDAStream_setStreamOnDevice(device, stream.internals()); } + void uncheckedSetCurrentCUDAStreamOnDevice(int64_t device, CUDAStream stream) + const { + return detail::CUDAStream_uncheckedSetStreamOnDevice( + device, stream.internals()); + } + #ifndef __HIP_PLATFORM_HCC__ cusparseHandle_t getCurrentCUDASparseHandle() const { return detail::getCUDAHooks().getCurrentCUDASparseHandle(thc_state.get()); diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap index ceb6123301acf6..f38d4e8a674cb1 100644 --- a/aten/src/ATen/Declarations.cwrap +++ b/aten/src/ATen/Declarations.cwrap @@ -2479,21 +2479,7 @@ - THTensor* other ]] [[ - name: clamp - cname: clamp - variants: - - method - - function - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - real min - - real max -]] -[[ - name: clamp_ + name: _th_clamp_ cname: clamp variants: - method @@ -2506,20 +2492,7 @@ - real max ]] [[ - name: clamp_min - cname: cmaxValue - variants: - - method - - function - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - real min -]] -[[ - name: clamp_min_ + name: _th_clamp_min_ cname: cmaxValue variants: - method @@ -2531,20 +2504,7 @@ - real min ]] [[ - name: clamp_max - cname: cminValue - variants: - - method - - function - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - real max -]] -[[ - name: clamp_max_ + name: _th_clamp_max_ cname: cminValue variants: - method @@ -3130,7 +3090,7 @@ default: S ]] [[ - name: inverse + name: _getri cname: getri types: - Float @@ -3691,38 +3651,6 @@ - THTensor* src ]] -[[ - name: as_strided - variants: [method,function] - cpu_half: True - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self - - THSize* size - - THStride* stride - - arg: int64_t storage_offset - aten_custom_call: | - ${THTensor}_setStorage(${state,}result_->tensor, self_->tensor->storage, storage_offset, size_, stride_); - result_->maybeScalar(size.size() == 0); -]] - -[[ - name: as_strided_ - variants: [method] - cpu_half: True - return: argument 0 - arguments: - - THTensor* self - - THSize* size - - THStride* stride - - arg: int64_t storage_offset - aten_custom_call: | - ${THTensor}_setStorage(${state,}self_->tensor, self_->tensor->storage, storage_offset, size_, stride_); - self_->maybeScalar(size.size() == 0); -]] - [[ name: _cat cname: catArray diff --git a/aten/src/ATen/Device.h b/aten/src/ATen/Device.h index a6afa187e55062..2d9c27f8d4cc3d 100644 --- a/aten/src/ATen/Device.h +++ b/aten/src/ATen/Device.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace at { /// Represents a a compute device on which a tensor is located. 
A device is @@ -112,3 +113,16 @@ struct Device { std::ostream& operator<<(std::ostream& stream, at::Device::Type type); std::ostream& operator<<(std::ostream& stream, const at::Device& device); + +namespace std { + template<> struct hash + { + size_t operator()(const at::Device& device) const noexcept { + size_t hash_val = static_cast(device.index() + 1); + if (device.is_cuda()) { + hash_val += 2; + } + return hash_val; + } + }; +} // namespace std diff --git a/aten/src/ATen/DeviceGuard.h b/aten/src/ATen/DeviceGuard.h index 6a3b84dcde9445..142eddab3d7345 100644 --- a/aten/src/ATen/DeviceGuard.h +++ b/aten/src/ATen/DeviceGuard.h @@ -28,7 +28,7 @@ struct DeviceGuard { } } - /// Calls `set_device` with the given index. + /// Calls `set_index` with the given index. explicit DeviceGuard(int32_t index) { set_index(index); } @@ -46,6 +46,29 @@ struct DeviceGuard { } } + /// Copy is disallowed. + DeviceGuard(const DeviceGuard&) = delete; + DeviceGuard& operator=(const DeviceGuard&) = delete; + + /// Move-constructs this `DeviceGuard` from another `DeviceGuard`. The + /// moved-from `DeviceGuard` is modified such that its destruction has no + /// effect (does not reset the device). + DeviceGuard(DeviceGuard&& other) noexcept { + *this = std::move(other); + } + + /// Move-assigns this `DeviceGuard` from another `DeviceGuard`. The + /// moved-from `DeviceGuard` is modified such that its destruction has no + /// effect (does not reset the device). + DeviceGuard& operator=(DeviceGuard&& other) noexcept { + this->original_index_ = other.original_index_; + this->last_index_ = other.last_index_; + // Set other's original index to the unspecified/default state, so that it + // doesn't also reset the device in its constructor. + other.original_index_ = -1; + return *this; + } + /// Resets the device to the index that was active at construction of the /// guard. ~DeviceGuard() { @@ -88,7 +111,7 @@ struct DeviceGuard { return original_index_; } - // /// Returns the last device that was set via `set_device`, if any. + /// Returns the last device that was set via `set_index`, if any. int32_t last_index() const noexcept { return last_index_; } @@ -96,7 +119,7 @@ struct DeviceGuard { private: /// The original device that was active at construction of this object. int32_t original_index_ = -1; - /// The last index that was set via `set_device`. + /// The last index that was set via `set_index`. 
int32_t last_index_ = -1; }; } // namespace at diff --git a/aten/src/ATen/THLongStorageView.h b/aten/src/ATen/THLongStorageView.h index 11c6ca8103e218..d1e97e194561f5 100644 --- a/aten/src/ATen/THLongStorageView.h +++ b/aten/src/ATen/THLongStorageView.h @@ -1,7 +1,7 @@ #pragma once #include "TH/TH.h" -#include "TH/THStorage.hpp" +#include "TH/THStorageFunctions.hpp" #include "TH/THTypeConversion.hpp" namespace at { @@ -37,7 +37,7 @@ class THLongStorageView { */ THLongStorageView(ArrayRef ref, THLongStorageViewKind kind) - : zero_dim_to_null(false) + : storage(at::CTypeToScalarType>::to(), 0, getTHDefaultAllocator(), 0), zero_dim_to_null(false) { // zero_dim_to_one converts an empty ArrayRef into [1] // zero_dim_to_null converts an empty ArrayRef into a null THLongStorage diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 296992468916f0..5872764a905ce7 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -64,15 +64,22 @@ void cuda_stream_destroy(cudaStream_t stream) { check_status(cudaStreamDestroy(stream)); } +void unchecked_cuda_stream_destroy(cudaStream_t stream) { + const auto return_code = cudaStreamDestroy(stream); + (void)return_code; +} + struct DynamicCUDAInterfaceSetter { DynamicCUDAInterfaceSetter() { - at::detail::DynamicCUDAInterface::set_device = set_device; - at::detail::DynamicCUDAInterface::get_device = get_device; - at::detail::DynamicCUDAInterface::unchecked_set_device = - unchecked_set_device; - at::detail::DynamicCUDAInterface::cuda_stream_create_with_priority = - cuda_stream_create_with_priority; - at::detail::DynamicCUDAInterface::cuda_stream_destroy = cuda_stream_destroy; + using at::detail::DynamicCUDAInterface; + DynamicCUDAInterface::set_device = set_device; + DynamicCUDAInterface::get_device = get_device; + DynamicCUDAInterface::unchecked_set_device = unchecked_set_device; + DynamicCUDAInterface::cuda_stream_create_with_priority = + cuda_stream_create_with_priority; + DynamicCUDAInterface::cuda_stream_destroy = cuda_stream_destroy; + DynamicCUDAInterface::unchecked_cuda_stream_destroy = + unchecked_cuda_stream_destroy; } }; diff --git a/aten/src/ATen/detail/CUDAHooksInterface.cpp b/aten/src/ATen/detail/CUDAHooksInterface.cpp index b6897ed0d6e270..288b066feafeb1 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.cpp +++ b/aten/src/ATen/detail/CUDAHooksInterface.cpp @@ -29,14 +29,20 @@ void default_unchecked_set_device(int32_t) { void default_cuda_stream_create_with_priority(cudaStream_t*, int32_t, int32_t) { AT_ERROR( - "DynamicCUDAInterface::cuda_stream_create_with_priority called " - "before CUDA library was loaded"); + "DynamicCUDAInterface::cuda_stream_create_with_priority called " + "before CUDA library was loaded"); } void default_cuda_stream_destroy(cudaStream_t) { AT_ERROR( - "DynamicCUDAInterface::cuda_stream_destroy called " - "before CUDA library was loaded"); + "DynamicCUDAInterface::cuda_stream_destroy called " + "before CUDA library was loaded"); +} + +void default_unchecked_cuda_stream_destroy(cudaStream_t) { + AT_ERROR( + "DynamicCUDAInterface::unchecked_cuda_stream_destroy called " + "before CUDA library was loaded"); } // Default the static members of DynamicCUDAInterface. 
@@ -44,11 +50,14 @@ void (*DynamicCUDAInterface::set_device)(int32_t) = default_set_device; void (*DynamicCUDAInterface::get_device)(int32_t*) = default_get_device; void (*DynamicCUDAInterface::unchecked_set_device)(int32_t) = default_unchecked_set_device; -void (*DynamicCUDAInterface::cuda_stream_create_with_priority)(cudaStream_t*, int32_t, int32_t) - = default_cuda_stream_create_with_priority; -void (*DynamicCUDAInterface::cuda_stream_destroy)(cudaStream_t) - = default_cuda_stream_destroy; - +void (*DynamicCUDAInterface::cuda_stream_create_with_priority)( + cudaStream_t*, + int32_t, + int32_t) = default_cuda_stream_create_with_priority; +void (*DynamicCUDAInterface::cuda_stream_destroy)(cudaStream_t) = + default_cuda_stream_destroy; +void (*DynamicCUDAInterface::unchecked_cuda_stream_destroy)(cudaStream_t) = + default_unchecked_cuda_stream_destroy; const CUDAHooksInterface& getCUDAHooks() { static std::unique_ptr cuda_hooks; diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index e15cf363bba038..f0596d01949d8f 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -177,6 +177,7 @@ struct AT_API DynamicCUDAInterface { static void (*unchecked_set_device)(int32_t); static void (*cuda_stream_create_with_priority)(cudaStream_t*, int32_t, int32_t); static void (*cuda_stream_destroy)(cudaStream_t); + static void (*unchecked_cuda_stream_destroy)(cudaStream_t); }; } // namespace detail } // namespace at diff --git a/aten/src/ATen/detail/UniqueVoidPtr.h b/aten/src/ATen/detail/UniqueVoidPtr.h index 866c0efc011b5e..e277014a7935d6 100644 --- a/aten/src/ATen/detail/UniqueVoidPtr.h +++ b/aten/src/ATen/detail/UniqueVoidPtr.h @@ -45,6 +45,10 @@ class UniqueVoidPtr { UniqueVoidPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter) : data_(data), ctx_(ctx, ctx_deleter ? 
ctx_deleter : &deleteNothing) {} void* operator->() const { return data_; } + void clear() { + ctx_ = nullptr; + data_ = nullptr; + } void* get() const { return data_; } void* get_context() const { return ctx_.get(); } void* release_context() { return ctx_.release(); } diff --git a/aten/src/ATen/native/DispatchStub.cpp b/aten/src/ATen/native/DispatchStub.cpp new file mode 100644 index 00000000000000..662ae580c599a8 --- /dev/null +++ b/aten/src/ATen/native/DispatchStub.cpp @@ -0,0 +1,44 @@ +#include "DispatchStub.h" + +#include + +#include +#include +#include + +namespace at { namespace native { + +static CPUCapability compute_cpu_capability() { + auto envar = std::getenv("ATEN_CPU_CAPABILITY"); + if (envar) { + if (strcmp(envar, "avx2") == 0) { + return CPUCapability::AVX2; + } + if (strcmp(envar, "avx") == 0) { + return CPUCapability::AVX; + } + if (strcmp(envar, "default") == 0) { + return CPUCapability::DEFAULT; + } + AT_WARN("ignoring invalid value for ATEN_CPU_CAPABILITY: ", envar); + } + +#ifndef __powerpc__ + if (cpuinfo_initialize()) { + if (cpuinfo_has_x86_avx2() && cpuinfo_has_x86_fma3()) { + return CPUCapability::AVX2; + } + if (cpuinfo_has_x86_avx()) { + return CPUCapability::AVX; + } + } +#endif + return CPUCapability::DEFAULT; +} + +CPUCapability get_cpu_capability() { + static CPUCapability capability = compute_cpu_capability(); + return capability; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/CapabilityDispatch.h b/aten/src/ATen/native/DispatchStub.h similarity index 51% rename from aten/src/ATen/native/cpu/CapabilityDispatch.h rename to aten/src/ATen/native/DispatchStub.h index 6cb0f279872d66..bbdf07a8458bf1 100644 --- a/aten/src/ATen/native/cpu/CapabilityDispatch.h +++ b/aten/src/ATen/native/DispatchStub.h @@ -1,8 +1,8 @@ #pragma once -#include +#include +#include #include -#include // Implements instruction set specific function dispatch. // @@ -23,72 +23,82 @@ // REGISTER_DISPATCH(stub, &kernel); // // To call: -// stub(tensor); +// stub(kCPU, tensor); // namespace at { namespace native { -enum class CPUCapability { DEFAULT, AVX, AVX2, NUM_OPTIONS }; +enum class CPUCapability { + DEFAULT = 0, + AVX = 1, + AVX2 = 2, + NUM_OPTIONS +}; + +CPUCapability get_cpu_capability(); template struct DispatchStub { static_assert(std::is_pointer::value, "FnPtr should be a pointer type"); template - void operator()(ArgTypes... args) { - if (!dispatch_ptr) { - dispatch_ptr = choose_impl(); + void operator()(Backend backend, ArgTypes... 
args) { + if (backend == Backend::CPU) { + if (!dispatch_ptr) { + dispatch_ptr = choose_cpu_impl(); + } + (*dispatch_ptr)(args...); + } else if (backend == Backend::CUDA) { + AT_ASSERTM(cuda_dispatch_ptr, "DispatchStub: missing CUDA kernel"); + (*cuda_dispatch_ptr)(args...); + } else { + AT_ERROR("DispatchStub: unsupported backend", backend); } - (*dispatch_ptr)(args...); } - FnPtr choose_impl() { -// Do not use cpuinfo on PowerPC as it shows confusing errors when run on ppc -#ifndef __powerpc__ - if (cpuinfo_initialize()) { - int avx2 = static_cast(CPUCapability::AVX2); - if (!std::getenv("ATEN_DISABLE_AVX2") && cpuinfo_has_x86_avx2() && - cpuinfo_has_x86_fma3() && table[avx2]) { - return table[avx2]; - } - int avx = static_cast(CPUCapability::AVX); - if (!std::getenv("ATEN_DISABLE_AVX") && cpuinfo_has_x86_avx() && table[avx]) { - return table[avx]; - } - } -#endif + FnPtr choose_cpu_impl() { int def = static_cast(CPUCapability::DEFAULT); + int avx = static_cast(CPUCapability::AVX); + int avx2 = static_cast(CPUCapability::AVX2); + + auto capability = static_cast(get_cpu_capability()); + if (capability >= avx2 && table[avx2]) { + return table[avx2]; + } + if (capability >= avx && table[avx]) { + return table[avx]; + } AT_ASSERTM(table[def], "DispatchStub: missing default kernel"); return table[def]; } FnPtr dispatch_ptr = nullptr; + FnPtr cuda_dispatch_ptr = nullptr; FnPtr table[static_cast(CPUCapability::NUM_OPTIONS)]; }; -#if defined(CPU_CAPABILITY) +#if defined(CPU_CAPABILITY) || defined(__CUDACC__) -constexpr CPUCapability CURRENT_CAPABILITY = CPUCapability::CPU_CAPABILITY; +namespace { -// Registers an implementation a kernel for the current CPU capability. -template +template struct RegisterDispatch { RegisterDispatch(DispatchStub& stub, FnPtr value) { - stub.table[static_cast(CURRENT_CAPABILITY)] = value; +#if defined(__CUDACC__) + stub.cuda_dispatch_ptr = value; +#else + int cap = static_cast(CPUCapability::CPU_CAPABILITY); + AT_ASSERT(!stub.table[cap]) + stub.table[cap] = value; +#endif } }; -// We only define the stub once in the DEFAULT capability compilation -#if defined(CPU_CAPABILITY_DEFAULT) -#define _DEFINE_STUB(stub, fn) DispatchStub stub -#else -#define _DEFINE_STUB(stub, fn) -#endif +} // anonymous namespace #define REGISTER_DISPATCH(stub, fn) \ - _DEFINE_STUB(stub, fn); \ static RegisterDispatch stub ## __register(stub, fn); #endif diff --git a/aten/src/ATen/native/Indexing.cpp b/aten/src/ATen/native/Indexing.cpp index 42ea2813bc0726..9720adb4895769 100644 --- a/aten/src/ATen/native/Indexing.cpp +++ b/aten/src/ATen/native/Indexing.cpp @@ -294,10 +294,6 @@ Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Ten AT_ERROR( "index_copy_(): When source is scalar, index should have one element (got ", numIndices, ")"); } - if (source.dim() > 0 && numIndices != source.size(dim)) { - AT_ERROR( - "index_copy_(): Number of indices (", numIndices, ") should be equal to source.size(dim) (", source.size(dim), ")"); - } if (index.type().scalarType() != ScalarType::Long) { AT_ERROR("index_copy_(): Expected LongTensor for index"); } @@ -309,7 +305,7 @@ Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Ten } auto sourceSlicedSizes = std::vector(source.sizes()); if (sourceSlicedSizes.size() > 0) { - sourceSlicedSizes.erase(sourceSlicedSizes.begin()); + sourceSlicedSizes.erase(sourceSlicedSizes.begin() + dim); } if (selfSlicedSizes.size() != sourceSlicedSizes.size() || !std::equal(selfSlicedSizes.begin(), selfSlicedSizes.end(), @@ -320,6 
+316,10 @@ Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Ten ss << " and source slice shape: " << sourceSlicedSizes << " at dimension 0."; throw std::runtime_error(ss.str()); } + if (source.dim() > 0 && numIndices != source.size(dim)) { + AT_ERROR( + "index_copy_(): Number of indices (", numIndices, ") should be equal to source.size(dim) (", source.size(dim), ")"); + } return self._indexCopy_(dim, index, source); } diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index ea87d42dfa58f0..388d704a834d48 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -83,10 +83,33 @@ std::tuple slogdet(const Tensor& self) { return std::make_tuple(det.sign(), diag_U.abs_().log_().sum()); } +Tensor inverse(const Tensor& self) { + Tensor result = self.type().tensor(); + return at::native::inverse_out(result, self); +} + +Tensor& inverse_out(Tensor &result, const Tensor &self) { + AT_CHECK(self.type().backend() == kCPU || self.type().backend() == kCUDA, + "tensor should have CPU or CUDA backend"); + AT_CHECK(self.dim() == 2, "tensor should be 2 dimensional"); + AT_CHECK(self.size(0) == self.size(1), "tensor should be square"); + AT_CHECK(at::isFloatingType(self.type().scalarType()), "tensor should be of floating-point type"); + if (self.size(0) == 0) { + return result.resize_({0, 0}); + } else { + return at::_getri_out(result, self); + } +} + Tensor pinverse(const Tensor& self, double rcond) { AT_CHECK(at::isFloatingType(self.type().scalarType()) && self.dim() == 2, "pinverse(", self.type(), "{", self.sizes(), "}): expected a 2D tensor " "of floating types"); + AT_CHECK(self.dim() == 2, "tensor should be 2 dimensional"); + if (self.numel() == 0) { + // Match NumPy + return self.type().tensor({self.size(1), self.size(0)}); + } Tensor U, S, V; std::tie(U, S, V) = self.svd(); double max_val = S[0].toCDouble(); diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index fd40bb3ab1f92c..d4ad799948b0c0 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -95,7 +95,9 @@ bound to ATen (in practice, C++ and Python.) **Argument names.** Argument names are meaningful; downstream binding code may make use of the specific argument name you provide, and a rename of an argument name is considered a BC-breaking change (e.g., you will probably need to update `tools/autograd/derivatives.yaml` at -least). +least). In `native_functions.yaml`, if your function's arguments include the result Tensor (usually +functions with an `_out` suffix), you must name that argument `Tensor result`. If there is more +than one result Tensor, name them `Tensor result0, Tensor result1, ...`. TODO: Do argument names affect Python keyword arguments? diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index affa9d24059d99..d055a91afa7596 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -17,6 +17,9 @@ namespace at { namespace native { +DispatchStub sum_kernel; +DispatchStub prod_kernel; + static inline Tensor integer_upcast(const Tensor& self, optional dtype) { ScalarType scalarType = self.type().scalarType(); ScalarType upcast_scalarType = dtype.value_or(at::isIntegralType(scalarType) ? 
ScalarType::Long : scalarType); @@ -127,7 +130,7 @@ Tensor sum(const Tensor &self) { Tensor _sum_cpu(const Tensor& self) { if (self.is_contiguous()) { Tensor result = at::empty({}, self.type()); - sum_kernel(result, self, at::nullopt); + sum_kernel(kCPU, result, self, at::nullopt); return result; } return self._sumall(); @@ -148,7 +151,7 @@ Tensor prod(const Tensor &self) { Tensor _prod_cpu(const Tensor &self) { if (self.is_contiguous()) { Tensor result = at::empty({}, self.type()); - prod_kernel(result, self, at::nullopt); + prod_kernel(kCPU, result, self, at::nullopt); return result; } return self._prodall(); @@ -222,7 +225,7 @@ Tensor &_sum_out_cpu(Tensor &result, const Tensor &self, int64_t dim_, return result; if (self.is_contiguous() && result.is_contiguous()) { _dimreduce_setup(result, self, dim); - sum_kernel(result, self, dim); + sum_kernel(kCPU, result, self, dim); if (!keepdim) result.squeeze_(dim); return result; } @@ -260,7 +263,7 @@ Tensor &_prod_out_cpu(Tensor &result, const Tensor &self, int64_t dim_, return result; if (self.is_contiguous() && result.is_contiguous()) { _dimreduce_setup(result, self, dim); - prod_kernel(result, self, dim); + prod_kernel(kCPU, result, self, dim); if (!keepdim) result.squeeze_(dim); return result; } diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index ff83c78f4554f6..56e0ab6ca6ba10 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -14,6 +14,9 @@ template void host_softmax(Tensor output, const Tensor& input, const int64_t dim) { int64_t outer_size = 1; int64_t dim_size = input.size(dim); + if (input.numel() == 0) { + return; + } int64_t inner_size = 1; for (int64_t i = 0; i < dim; ++i) outer_size *= input.size(i); @@ -125,7 +128,7 @@ Tensor softmax_cpu(const Tensor& input_, const int64_t dim_) { dim >= 0 && dim < input.dim(), "dim must be non-negative and less than input dimensions"); if (input.ndimension() > 0 && dim == input.ndimension() - 1) { - softmax_lastdim_kernel(output, input); + softmax_lastdim_kernel(kCPU, output, input); } else { AT_DISPATCH_FLOATING_TYPES(input.type(), "softmax", [&] { host_softmax(output, input, dim); @@ -144,7 +147,7 @@ Tensor log_softmax_cpu(const Tensor& input_, const int64_t dim_) { dim >= 0 && dim < input.dim(), "dim must be non-negative and less than input dimensions"); if (input.ndimension() > 0 && dim == input.ndimension() - 1) { - log_softmax_lastdim_kernel(output, input); + log_softmax_lastdim_kernel(kCPU, output, input); } else { AT_DISPATCH_FLOATING_TYPES(input.type(), "log_softmax", [&] { host_softmax(output, input, dim); @@ -173,7 +176,7 @@ Tensor softmax_backward_cpu( dim >= 0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) { - softmax_backward_lastdim_kernel(grad_input, grad, output); + softmax_backward_lastdim_kernel(kCPU, grad_input, grad, output); } else { AT_DISPATCH_FLOATING_TYPES(grad.type(), "softmax_backward", [&] { host_softmax_backward(grad_input, grad, output, dim); @@ -202,7 +205,7 @@ Tensor log_softmax_backward_cpu( dim >= 0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); if (grad.ndimension() > 0 && dim == grad.ndimension() - 1) { - log_softmax_backward_lastdim_kernel(grad_input, grad, output); + log_softmax_backward_lastdim_kernel(kCPU, grad_input, grad, output); } else { AT_DISPATCH_FLOATING_TYPES(grad.type(), "log_softmax_backward", [&] { host_softmax_backward(grad_input, grad, output, 
dim); @@ -210,5 +213,11 @@ Tensor log_softmax_backward_cpu( } return grad_input; } + +DispatchStub softmax_lastdim_kernel; +DispatchStub log_softmax_lastdim_kernel; +DispatchStub softmax_backward_lastdim_kernel; +DispatchStub log_softmax_backward_lastdim_kernel; + } } diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 17770cba906534..5d1c883bee3f1f 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -188,26 +188,24 @@ Tensor irfft(const Tensor& self, const int64_t signal_ndim, const bool normalize } -Tensor stft(const Tensor& self, const int64_t frame_length, - const int64_t hop, const int64_t fft_size, - const bool normalized, const bool onesided, - const Tensor& window, const int64_t pad_end) { +Tensor stft(const Tensor& self, const int64_t n_fft, const int64_t hop_length, + const int64_t win_length, const Tensor& window, + const bool normalized, const bool onesided) { #define REPR(SS) \ - SS << "stft(" << self.type() << self.sizes() << ", frame_length=" \ - << frame_length << ", hop=" << hop << ", fft_size=" << fft_size \ - << ", normalized=" << normalized << ", onesided=" << onesided << \ - ", window="; \ + SS << "stft(" << self.type() << self.sizes() << ", n_fft=" << n_fft \ + << ", hop_length=" << hop_length << ", win_length=" << win_length \ + << ", window="; \ if (window.defined()) { \ SS << window.type() << "{" << window.sizes() << "}"; \ } else { \ SS << "None"; \ } \ - SS << ", pad_end=" << pad_end << ")" + SS << ", normalized=" << normalized << ", onesided=" << onesided << ")" if (!at::isFloatingType(self.type().scalarType()) || self.dim() > 2 || self.dim() < 1) { std::ostringstream ss; REPR(ss) << ": expected a 1D or 2D tensor of floating types"; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } Tensor input = self; if (self.dim() == 1) { @@ -215,66 +213,52 @@ Tensor stft(const Tensor& self, const int64_t frame_length, } int64_t batch = input.size(0); int64_t len = input.size(1); - if (pad_end < 0) { - std::ostringstream ss; - REPR(ss) << ": expected pad_end >= 0, but got pad_end=" << pad_end; - throw std::runtime_error(ss.str()); - } - // pad zeros - if (pad_end != 0) { - Tensor padded_input = at::zeros({batch, len + pad_end}, self.type()); - padded_input.narrow(1, 0, len).copy_(input); - len += pad_end; - input = padded_input; - } - if (frame_length <= 0 || frame_length > len) { + if (n_fft <= 0 || n_fft > len) { std::ostringstream ss; - REPR(ss) << ": expected 0 < frame_length < " << len - << ", but got frame_length=" << frame_length; - throw std::runtime_error(ss.str()); + REPR(ss) << ": expected 0 < n_fft < " << len + << ", but got n_fft=" << win_length; + AT_ERROR(ss.str()); } - if (hop <= 0) { + if (hop_length <= 0) { std::ostringstream ss; - REPR(ss) << " expected hop > 0, but got hop=" << hop; + REPR(ss) << ": expected hop_length > 0, but got hop_length=" << hop_length; throw std::runtime_error(ss.str()); } - if (fft_size <= 0) { + if (win_length <= 0 || win_length > n_fft) { std::ostringstream ss; - REPR(ss) << " expected fft_size > 0, but got fft_size=" << fft_size; - throw std::runtime_error(ss.str()); + REPR(ss) << ": expected 0 < win_length <= n_fft, but got win_length=" + << win_length; + AT_ERROR(ss.str()); } - if (window.defined() && (window.dim() != 1 || window.size(0) != frame_length)) { + if (window.defined() && (window.dim() != 1 || window.size(0) != win_length)) { std::ostringstream ss; - REPR(ss) << ": expected a 1D window tensor of size equal to " - << 
"frame_length=" << frame_length - << ", but got window with size " << window.sizes(); - throw std::runtime_error(ss.str()); + REPR(ss) << ": expected a 1D window tensor of size equal to win_length=" + << win_length << ", but got window with size " << window.sizes(); + AT_ERROR(ss.str()); } #undef REPR - int64_t return_size = onesided ? infer_ft_real_to_complex_onesided_size(fft_size) : fft_size; - // build ft kernel - // k[omega, t] = cos (2 pi omega t / N) - j sin (2 pi omega t / N) - double N = static_cast(fft_size); - auto freq_arange = at::arange(0, return_size, self.type()).mul_(M_PI * 2. / N); - auto time_arange = at::arange(0, frame_length, self.type()); - auto arange_2d = at::ger(freq_arange, time_arange); - auto re_kernel = arange_2d.cos(); - auto im_kernel = arange_2d.sin().neg_(); - auto kernel = at::cat({re_kernel, im_kernel}, 0); - if (window.defined()) { - kernel *= window.view({1, -1}); + auto window_ = window; + if (win_length < n_fft) { + // pad center + window_ = at::zeros({n_fft}, self.options()); + auto left = (n_fft - win_length) / 2; + if (window.defined()) { + window_.narrow(0, left, win_length).copy_(window); + } else { + window_.narrow(0, left, win_length).fill_(1); + } } - if (normalized) { - double T = static_cast(frame_length); - kernel.div_(std::sqrt(T)); + int64_t n_frames = 1 + (len - n_fft) / hop_length; + // time2col + input = input.as_strided( + {batch, n_frames, n_fft}, + {input.stride(0), hop_length * input.stride(1), input.stride(1)} + ); + if (window_.defined()) { + input = input.mul(window_); } - // prepare for conv1d - input = input.view({batch, 1, len}); - kernel = kernel.view({return_size * 2, 1, frame_length}); - // conv is actually correlation, so we are good - auto conv_out = at::conv1d(input, kernel, {}, hop).squeeze_(-1); - // transpose to [batch x time x freq x (re/im)] - auto out = conv_out.view({batch, 2, return_size, -1}).transpose_(1, -1); + // rfft and transpose to get (batch x fft_size x num_frames) + auto out = input.rfft(1, normalized, onesided).transpose_(1, 2); if (self.dim() == 1) { return out.squeeze_(0); } else { diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index c8f6158994acb3..13887a52633bc2 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -135,12 +135,20 @@ Tensor expand_as(const Tensor& self, const Tensor& other) { return self.expand(other.sizes()); } +Tensor as_strided(const Tensor& self, IntList size, IntList stride, int64_t storage_offset) { + return self.type().tensor().set_(*self.storage(), storage_offset, size, stride); +} + +Tensor &as_strided_(Tensor& self, IntList size, IntList stride, int64_t storage_offset) { + return self.set_(*self.storage(), storage_offset, size, stride); +} + Tensor as_strided(const Tensor& self, IntList size, IntList stride) { - return self.as_strided(size, stride, self.storage_offset()); + return at::as_strided(self, size, stride, self.storage_offset()); } Tensor &as_strided_(Tensor& self, IntList size, IntList stride) { - return self.as_strided_(size, stride, self.storage_offset()); + return at::as_strided_(self, size, stride, self.storage_offset()); } Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { @@ -265,6 +273,10 @@ Tensor reshape(const Tensor& self, IntList proposed_shape) { return at::_unsafe_view(self.clone(), shape); } +Tensor reshape_as(const Tensor& self, const Tensor& other) { + return self.reshape(other.sizes()); +} + Tensor select(const Tensor& self, int64_t 
dim, int64_t index) { int64_t ndim = self.dim(); AT_CHECK(ndim > 0, "select() cannot be applied to a 0-dim tensor."); diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index f32a206123ad75..17f5a437b00155 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -24,6 +24,55 @@ namespace at { namespace native { +Tensor clamp(const Tensor& self, Scalar min, Scalar max) { + Tensor result = self.type().tensor(); + return clamp_out(result, self, min, max); +} + +Tensor clamp_max(const Tensor& self, Scalar max) { + Tensor result = self.type().tensor(); + return clamp_max_out(result, self, max); +} + +Tensor clamp_min(const Tensor& self, Scalar min) { + Tensor result = self.type().tensor(); + return clamp_min_out(result, self, min); +} + +Tensor& _clamp__cpu(Tensor& self, Scalar min, Scalar max) { + return _th_clamp_(self, min, max); +} + +Tensor& _clamp_out_cpu( + Tensor& result, + const Tensor& self, + Scalar min, + Scalar max) { + result.resize_(self.sizes()); + result.copy_(self); + return _th_clamp_(result, min, max); +} + +Tensor& _clamp_max__cpu(Tensor& self, Scalar max) { + return _th_clamp_max_(self, max); +} + +Tensor& _clamp_max_out_cpu(Tensor& result, const Tensor& self, Scalar max) { + result.resize_(self.sizes()); + result.copy_(self); + return _th_clamp_max_(result, max); +} + +Tensor& _clamp_min__cpu(Tensor& self, Scalar min) { + return _th_clamp_min_(self, min); +} + +Tensor& _clamp_min_out_cpu(Tensor& result, const Tensor& self, Scalar min) { + result.resize_(self.sizes()); + result.copy_(self); + return _th_clamp_min_(result, min); +} + Tensor& fill_(Tensor& self, Scalar value) { return self._fill_(value); } @@ -43,14 +92,14 @@ Tensor& fill_(Tensor& self, const Tensor& value) { Tensor& _##op##__cpu(Tensor& self_) { \ if (self_.numel() > 0) { \ Tensor self = sort_strides(self_); \ - op##Impl(self, self); \ + op##Impl(kCPU, self, self); \ } \ return self_; \ } \ Tensor& _##op##_out_cpu(Tensor& result, const Tensor& self) { \ result.resize_(self.sizes()); \ if (result.numel() > 0) { \ - op##Impl(result, self); \ + op##Impl(kCPU, result, self); \ } \ return result; \ } @@ -96,5 +145,29 @@ IMPLEMENT_UNARY_OP_VEC(tan) IMPLEMENT_UNARY_OP_VEC(tanh) IMPLEMENT_UNARY_OP_VEC(trunc) +DispatchStub absImpl; +DispatchStub acosImpl; +DispatchStub asinImpl; +DispatchStub atanImpl; +DispatchStub ceilImpl; +DispatchStub cosImpl; +DispatchStub erfImpl; +DispatchStub erfcImpl; +DispatchStub expImpl; +DispatchStub expm1Impl; +DispatchStub floorImpl; +DispatchStub logImpl; +DispatchStub log10Impl; +DispatchStub log1pImpl; +DispatchStub log2Impl; +DispatchStub roundImpl; +DispatchStub rsqrtImpl; +DispatchStub sigmoidImpl; +DispatchStub sinImpl; +DispatchStub sqrtImpl; +DispatchStub tanImpl; +DispatchStub tanhImpl; +DispatchStub truncImpl; + } } // namespace at diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.h b/aten/src/ATen/native/cpu/ReduceOpsKernel.h index 9481b90fe76965..5a7854d0094cd5 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.h +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.h @@ -1,8 +1,8 @@ #pragma once #include +#include #include -#include "CapabilityDispatch.h" namespace at { namespace native { diff --git a/aten/src/ATen/native/cpu/SoftmaxKernel.h b/aten/src/ATen/native/cpu/SoftmaxKernel.h index dbd703b6d3c028..39d7e68a8b20c9 100644 --- a/aten/src/ATen/native/cpu/SoftmaxKernel.h +++ b/aten/src/ATen/native/cpu/SoftmaxKernel.h @@ -1,7 +1,7 @@ #pragma once #include -#include "CapabilityDispatch.h" +#include 
namespace at { namespace native { diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 7416923cfd8867..459838a9b6c689 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -4,7 +4,7 @@ #include "ATen/Dispatch.h" #include "ATen/cpu/vml.h" #include "ATen/CPUApplyUtils.h" -#include "ATen/native/cpu/CapabilityDispatch.h" +#include "ATen/native/DispatchStub.h" #ifdef __AVX2__ #include "ATen/native/cpu/avx_mathfun.h" #endif diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.h b/aten/src/ATen/native/cpu/UnaryOpsKernel.h index d9bffadd1e1fbd..d4845760f7248d 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.h +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.h @@ -1,8 +1,8 @@ #pragma once #include +#include #include -#include "CapabilityDispatch.h" namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp b/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp index 2e524f4d8e62d4..2bef41ee251955 100644 --- a/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp +++ b/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp @@ -2,6 +2,40 @@ namespace at { namespace native { +Tensor& _clamp__cuda(Tensor& self, Scalar min, Scalar max) { + return _th_clamp_(self, min, max); +} + +Tensor& _clamp_out_cuda( + Tensor& result, + const Tensor& self, + Scalar min, + Scalar max) { + result.resize_(self.sizes()); + result.copy_(self); + return _th_clamp_(result, min, max); +} + +Tensor& _clamp_max__cuda(Tensor& self, Scalar max) { + return _th_clamp_max_(self, max); +} + +Tensor& _clamp_max_out_cuda(Tensor& result, const Tensor& self, Scalar max) { + result.resize_(self.sizes()); + result.copy_(self); + return _th_clamp_max_(result, max); +} + +Tensor& _clamp_min__cuda(Tensor& self, Scalar min) { + return _th_clamp_min_(self, min); +} + +Tensor& _clamp_min_out_cuda(Tensor& result, const Tensor& self, Scalar min) { + result.resize_(self.sizes()); + result.copy_(self); + return _th_clamp_min_(result, min); +} + // These are just forwarding stubs #define IMPLEMENT_UNARY_OP_PREQUEL(op) \ diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu index fa1c7628afb9e3..cc8e78c292dbc9 100644 --- a/aten/src/ATen/native/cuda/TensorTransformations.cu +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -39,7 +39,8 @@ kernel_pointwise_flip_apply2(const cuda::detail::TensorInfo template __global__ -void flip_cuda_kernel(scalar_t* in_tensor, scalar_t* out_tensor, int64_t N, int64_t* flip_dims, int64_t flip_dims_size, int64_t* strides, int64_t* strides_contiguous, int64_t* shape, int64_t total_dims) { +void flip_cuda_kernel(scalar_t* in_tensor, scalar_t* out_tensor, int64_t N, int64_t* flip_dims, int64_t flip_dims_size, + int64_t* strides, int64_t* strides_contiguous, int64_t* shape, int64_t total_dims) { int64_t linear_index = blockIdx.x * blockDim.x + threadIdx.x; if (linear_index >= N) { @@ -99,18 +100,22 @@ Tensor flip_cuda(const Tensor& self, IntList dims) { auto out_tensor = at::empty_like(in_tensor); - // stride_contiguous is the stride of non-contiguous tensor after called contiguous(), it is used to compute indices for each element in non-contiguous tensor + // stride_contiguous is the stride of non-contiguous tensor after calling contiguous(), + // it is used to compute indices for each element in non-contiguous tensor Tensor stride_contiguous = at::zeros({total_dims}, kLong); int64_t* stride_contiguous_d = stride_contiguous.data(); - int64_t 
tmp = N; - for (int64_t i = 0; i < total_dims; i++) { - tmp = tmp / shape[i]; - stride_contiguous_d[i] = tmp; + for (int64_t i = total_dims - 1; i >= 0; i--) { + if (i == total_dims - 1) { + stride_contiguous_d[i] = 1; + } else { + stride_contiguous_d[i] = std::max(shape[i+1], 1) * stride_contiguous_d[i + 1]; + } } AT_DISPATCH_ALL_TYPES_AND_HALF(in_tensor.type(), "flip_cuda", [&] { flip_cuda_kernel<<>>( - in_tensor.data(), out_tensor.data(), N, flip_dims_t.toType(CUDA(kLong)).data(), flip_dims_size, strides_t.toType(CUDA(kLong)).data(), stride_contiguous.toType(CUDA(kLong)).data(), shape_t.toType(CUDA(kLong)).data(), total_dims); + in_tensor.data(), out_tensor.data(), N, flip_dims_t.toType(CUDA(kLong)).data(), flip_dims_size, + strides_t.toType(CUDA(kLong)).data(), stride_contiguous.toType(CUDA(kLong)).data(), shape_t.toType(CUDA(kLong)).data(), total_dims); }); return out_tensor; diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 74e6481e7d63d8..89f2771b8dadf3 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -159,11 +159,18 @@ - func: argmin(Tensor self) -> Tensor - func: _argmin(Tensor self, int64_t dim, bool keepdim=false) -> Tensor -# The actual implementations live in Declarations.cwrap. These are just to -# provide default values for storage_offset=self.storage_offset() - func: as_strided(Tensor self, IntList size, IntList stride) -> Tensor + - func: as_strided_(Tensor self, IntList size, IntList stride) -> Tensor +- func: as_strided(Tensor self, IntList size, IntList stride, int64_t storage_offset) -> Tensor + python_default_init: + storage_offset: self.storage_offset() + +- func: as_strided_(Tensor self, IntList size, IntList stride, int64_t storage_offset) -> Tensor + python_default_init: + storage_offset: self.storage_offset() + - func: asin(Tensor self) -> Tensor - func: asin_(Tensor self) -> Tensor @@ -246,6 +253,45 @@ - func: chunk(Tensor self, int64_t chunks, int64_t dim=0) -> TensorList +- func: clamp(Tensor self, Scalar min, Scalar max) -> Tensor + +- func: clamp_(Tensor self, Scalar min, Scalar max) -> Tensor + dispatch: + CPU: _clamp__cpu + CUDA: _clamp__cuda + +- func: clamp_out(Tensor result, Tensor self, Scalar min, Scalar max) -> Tensor + variants: function + dispatch: + CPU: _clamp_out_cpu + CUDA: _clamp_out_cuda + +- func: clamp_max(Tensor self, Scalar max) -> Tensor + +- func: clamp_max_(Tensor self, Scalar max) -> Tensor + dispatch: + CPU: _clamp_max__cpu + CUDA: _clamp_max__cuda + +- func: clamp_max_out(Tensor result, Tensor self, Scalar max) -> Tensor + variants: function + dispatch: + CPU: _clamp_max_out_cpu + CUDA: _clamp_max_out_cuda + +- func: clamp_min(Tensor self, Scalar min) -> Tensor + +- func: clamp_min_(Tensor self, Scalar min) -> Tensor + dispatch: + CPU: _clamp_min__cpu + CUDA: _clamp_min__cuda + +- func: clamp_min_out(Tensor result, Tensor self, Scalar min) -> Tensor + variants: function + dispatch: + CPU: _clamp_min_out_cpu + CUDA: _clamp_min_out_cuda + - func: cudnn_is_acceptable(Tensor self) -> bool variants: function device_guard: false @@ -718,6 +764,11 @@ - func: index_put_(Tensor self, TensorList indices, Tensor values) -> Tensor +- func: inverse(Tensor self) -> Tensor + +- func: inverse_out(Tensor result, Tensor self) -> Tensor + variants: function + - func: isclose(Tensor self, Tensor other, double rtol=1e-5, double atol=1e-8, bool equal_nan=False) -> Tensor - func: is_cuda(Tensor self) -> bool @@ -1096,6 +1147,9 @@ - func: 
reshape(Tensor self, IntList shape) -> Tensor +- func: reshape_as(Tensor self, Tensor other) -> Tensor + variants: method + - func: RoiPooling2d_forward(Tensor input, Tensor rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale) -> (Tensor, Tensor) variants: function dispatch: @@ -1250,9 +1304,14 @@ - func: stack_out(Tensor result, TensorList tensors, int64_t dim=0) -> Tensor variants: function -- func: stft(Tensor self, int64_t frame_length, int64_t hop, int64_t fft_size, bool normalized=false, bool onesided=true, Tensor? window={}, int64_t pad_end=0) -> Tensor +# The signature is designed to be consistent with librosa except that it is +# missing the `pad_mode` and `center` arguments, which are taken care of at +# `torch.functional.py`. They shall be moved here once we have mapping between +# Python strings and C++ Enum in codegen. +- func: stft(Tensor self, int64_t n_fft, int64_t hop_length, int64_t win_length, Tensor? window={}, bool normalized=false, bool onesided=true) -> Tensor python_default_init: - fft_size: frame_length + hop_length: n_fft >> 2 + win_length: n_fft - func: stride(Tensor self, int64_t dim) -> int64_t device_guard: false diff --git a/aten/src/ATen/templates/TensorDense.cpp b/aten/src/ATen/templates/TensorDense.cpp index 92ffeb32352f5a..9f977d50ead2b5 100644 --- a/aten/src/ATen/templates/TensorDense.cpp +++ b/aten/src/ATen/templates/TensorDense.cpp @@ -1,7 +1,9 @@ // included as 'TensorDenseOrSparse' in TensorDerived.cpp IntList ${Tensor}::strides() const { - return IntList(tensor->stride,dim()); + // NB: THTensor doesn't agree with Tensor for scalars, so we + // have to construct a fresh IntList + return IntList(THTensor_getStridePtr(tensor), dim()); } Scalar ${Tensor}::localScalar() { int64_t numel = ${THTensor}_nElement(${state,}tensor); diff --git a/aten/src/ATen/templates/TensorDerived.cpp b/aten/src/ATen/templates/TensorDerived.cpp index e15eb5fcb07dda..d89e84ee5e702e 100644 --- a/aten/src/ATen/templates/TensorDerived.cpp +++ b/aten/src/ATen/templates/TensorDerived.cpp @@ -31,7 +31,9 @@ const char * ${Tensor}::toString() const { } IntList ${Tensor}::sizes() const { - return IntList(tensor->size,dim()); + // NB: dim in tensor is not synchronized with THTensor, so it's + // important to apply dim here + return IntList(THTensor_getSizePtr(tensor), dim()); } int64_t ${Tensor}::dim() const { diff --git a/aten/src/ATen/test/stream_test.cpp b/aten/src/ATen/test/stream_test.cpp index 894602639b462e..1f877e3e8b5987 100644 --- a/aten/src/ATen/test/stream_test.cpp +++ b/aten/src/ATen/test/stream_test.cpp @@ -5,16 +5,18 @@ #include "cuda_runtime.h" -#include #include +#include /* Tests related to ATen streams. 
*/ -TEST_CASE("Copying and Moving Streams", "Verifies streams are live through copying and moving") { +TEST_CASE( + "Copying and Moving Streams", + "Verifies streams are live through copying and moving") { int32_t device = -1; cudaStream_t cuda_stream; - + // Tests that copying works as expected and preserves the stream at::CUDAStream copyStream; { @@ -23,7 +25,7 @@ TEST_CASE("Copying and Moving Streams", "Verifies streams are live through copyi cuda_stream = s.stream(); copyStream = s; - + REQUIRE(copyStream.internals() == s.internals()); REQUIRE(copyStream.device() == device); REQUIRE(copyStream.stream() == cuda_stream); @@ -57,7 +59,7 @@ TEST_CASE("Getting and Setting Streams", "Verifies streams are set properly") { // Sets and gets at::globalContext().setCurrentCUDAStream(myStream); at::CUDAStream curStream = at::globalContext().getCurrentCUDAStream(); - + REQUIRE(myStream == curStream); // Gets, sets, and gets default stream @@ -71,8 +73,7 @@ TEST_CASE("Getting and Setting Streams", "Verifies streams are set properly") { TEST_CASE("Stream API retain/free", "Ensures streams are destroyed properly") { auto ptr = at::detail::CUDAStream_createAndRetainWithOptions( - at::CUDAStream::DEFAULT_FLAGS - , at::CUDAStream::DEFAULT_PRIORITY); + at::CUDAStream::DEFAULT_FLAGS, at::CUDAStream::DEFAULT_PRIORITY); at::detail::CUDAStream_free(ptr); REQUIRE(ptr == nullptr); @@ -85,7 +86,9 @@ void thread_fun(at::CUDAStream& cur_thread_stream) { REQUIRE(cur_thread_stream == new_stream); } -TEST_CASE("Multithread Getting and Setting", "Ensures streams are thread local") { +TEST_CASE( + "Multithread Getting and Setting", + "Ensures streams are thread local") { at::CUDAStream s0, s1; std::thread t0{thread_fun, std::ref(s0)}; @@ -101,3 +104,98 @@ TEST_CASE("Multithread Getting and Setting", "Ensures streams are thread local") REQUIRE(cur_stream != s1); REQUIRE(s0 != s1); } + +TEST_CASE("CUDAGuard") { + if (at::globalContext().getNumGPUs() < 2) { + return; + } + + // -- begin setup + + REQUIRE(at::current_device() == 0); + std::vector streams0 = { + at::globalContext().getDefaultCUDAStream(), + at::globalContext().createCUDAStream()}; + REQUIRE(streams0[0].device() == 0); + REQUIRE(streams0[1].device() == 0); + at::globalContext().setCurrentCUDAStreamOnDevice(0, streams0[0]); + + std::vector streams1; + { + at::DeviceGuard device_guard(1); + streams1.push_back(at::globalContext().getDefaultCUDAStream()); + streams1.push_back(at::globalContext().createCUDAStream()); + } + REQUIRE(streams1[0].device() == 1); + REQUIRE(streams1[1].device() == 1); + at::globalContext().setCurrentCUDAStreamOnDevice(1, streams1[0]); + + REQUIRE(at::current_device() == 0); + + // -- end setup + + // Test that all original streams are recorded. 
+ { + at::CUDAGuard guard; + REQUIRE(guard.original_streams().empty()); + guard.set_stream(streams0[0]); + REQUIRE( + guard.original_streams().size() == at::globalContext().getNumGPUs()); + REQUIRE(guard.original_streams()[0] == streams0[0]); + REQUIRE(guard.original_streams()[1] == streams1[0]); + } + + // Setting a stream changes the current device and the stream on that device + { + at::CUDAGuard guard(streams1[1]); + REQUIRE(guard.last_device() == 1); + REQUIRE(at::current_device() == 1); + REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(1) == streams1[1]); + } + + // Device and stream are now reset + REQUIRE(at::current_device() == 0); + REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(1) == streams1[0]); + + // Setting only the device changes only the current device and not the stream + { + at::CUDAGuard guard(/*device=*/1); + REQUIRE(guard.last_device() == 1); + REQUIRE(at::current_device() == 1); + REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(1) == streams1[0]); + } + + REQUIRE(at::current_device() == 0); + REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(0) == streams0[0]); + + // Setting the stream first, and then the device, first changes the devices + // back, and then resets the stream on the initial device. + + { + at::CUDAGuard guard(streams0[1]); + guard.set_device(1); + } + + REQUIRE(at::current_device() == 0); + REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(0) == streams0[0]); + REQUIRE(at::globalContext().getCurrentCUDAStreamOnDevice(1) == streams1[0]); +} + +TEST_CASE("CUDAGuardIsMovable") { + if (at::globalContext().getNumGPUs() < 2) { + return; + } + const auto stream = at::globalContext().createCUDAStream(); + const auto device_count = at::globalContext().getNumGPUs(); + at::CUDAGuard first(stream); + first.set_device(1); + at::CUDAGuard second(std::move(first)); + REQUIRE(second.original_streams().size() == device_count); + REQUIRE(second.original_device() == 0); + REQUIRE(second.last_device() == 1); + at::CUDAGuard third; + third = std::move(second); + REQUIRE(third.original_streams().size() == device_count); + REQUIRE(third.original_device() == 0); + REQUIRE(third.last_device() == 1); +} diff --git a/aten/src/README.md b/aten/src/README.md index a641ea1b5ffb8f..530d8dd6b48c6e 100644 --- a/aten/src/README.md +++ b/aten/src/README.md @@ -75,7 +75,7 @@ under some conditions you have to have to call, e.g., `newContiguous`, to get it into the correct form: ``` - if (!(k_->stride[3] == 1) || !(k_->stride[2] == k_->size[3])) { + if (!(k_->stride(3) == 1) || !(k_->stride[2] == k_->size(3))) { kernel = THTensor_(newContiguous)(k_); } else { THTensor_(retain)(k_); diff --git a/aten/src/TH/CMakeLists.txt b/aten/src/TH/CMakeLists.txt index 5d588df14ada2e..86fd8db5ff55c6 100644 --- a/aten/src/TH/CMakeLists.txt +++ b/aten/src/TH/CMakeLists.txt @@ -21,7 +21,7 @@ IF(C_AVX2_FOUND) ENDIF(C_AVX2_FOUND) SET(hdr - THGeneral.h THHalf.h THAllocator.h THSize.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h + THGeneral.h THHalf.h THAllocator.h THSize.h THStorage.h THStorageFunctions.h THTensor.h THTensorApply.h THBlas.h THMath.h THLapack.h THLogAdd.h THRandom.h THVector.h ) set(ATen_TH_SRCS @@ -29,7 +29,8 @@ set(ATen_TH_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/THHalf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THAllocator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THSize.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/THStorage.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THStorageClass.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/THStorageFunctions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THTensor.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/THBlas.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THLapack.cpp @@ -86,13 +87,15 @@ INSTALL(FILES THRandom.h THSize.h THStorage.h + THStorageFunctions.h THTensor.h THTensorApply.h THTensorDimApply.h THVector.h THHalf.h THTensor.hpp - THStorage.hpp + THStorageClass.hpp + THStorageFunctions.hpp THGenerator.hpp THTypeConversion.hpp DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH") diff --git a/aten/src/TH/TH.h b/aten/src/TH/TH.h index 08bdde867ce45e..1faf6e52b5a468 100644 --- a/aten/src/TH/TH.h +++ b/aten/src/TH/TH.h @@ -12,7 +12,7 @@ #include "THLogAdd.h" #include "THRandom.h" #include "THSize.h" -#include "THStorage.h" +#include "THStorageFunctions.h" #include "THTensor.h" #include "THTensorApply.h" #include "THTensorDimApply.h" diff --git a/aten/src/TH/THAllocator.h b/aten/src/TH/THAllocator.h index c054c0c07c052c..460f23873fff08 100644 --- a/aten/src/TH/THAllocator.h +++ b/aten/src/TH/THAllocator.h @@ -25,7 +25,7 @@ typedef struct at_THAllocator THAllocator; /* default malloc/free allocator. malloc and realloc raise an error (using * THError) on allocation failure. */ -TH_API THAllocator* getTHDefaultAllocator(); +TH_API THAllocator* getTHDefaultAllocator(void); #ifdef __cplusplus // Sentinel value/type to help distinguish the file descriptor constructor from diff --git a/aten/src/TH/THFile.cpp b/aten/src/TH/THFile.cpp index f3e17419dfaca9..ae0fdf10455b6e 100644 --- a/aten/src/TH/THFile.cpp +++ b/aten/src/TH/THFile.cpp @@ -1,5 +1,5 @@ #include "THFile.h" -#include "THStorage.hpp" +#include "THStorageFunctions.hpp" #include "THFilePrivate.h" #define IMPLEMENT_THFILE_RW(TYPEC, TYPE) \ diff --git a/aten/src/TH/THFile.h b/aten/src/TH/THFile.h index 27041f51c70982..8844b0eca66d26 100644 --- a/aten/src/TH/THFile.h +++ b/aten/src/TH/THFile.h @@ -1,7 +1,7 @@ #ifndef TH_FILE_INC #define TH_FILE_INC -#include "THStorage.h" +#include "THStorageFunctions.h" typedef struct THFile__ THFile; diff --git a/aten/src/TH/THGeneral.cpp b/aten/src/TH/THGeneral.cpp index 667d7fbf253d47..1b89e17dce997c 100644 --- a/aten/src/TH/THGeneral.cpp +++ b/aten/src/TH/THGeneral.cpp @@ -303,7 +303,7 @@ TH_API void THInferNumThreads(void) #endif } -TH_API THDescBuff _THSizeDesc(const int64_t *size, const int64_t ndim) { +THDescBuff _THSizeDesc(const int64_t *size, const int64_t ndim) { const int L = TH_DESC_BUFF_LEN; THDescBuff buf; char *str = buf.str; diff --git a/aten/src/TH/THMemoryFile.cpp b/aten/src/TH/THMemoryFile.cpp index e13b02f8c29c0b..46582c913270cb 100644 --- a/aten/src/TH/THMemoryFile.cpp +++ b/aten/src/TH/THMemoryFile.cpp @@ -1,5 +1,5 @@ #include "THMemoryFile.h" -#include "THStorage.hpp" +#include "THStorageFunctions.hpp" #include "THFilePrivate.h" #include "THDiskFile.h" #include "stdint.h" diff --git a/aten/src/TH/THMemoryFile.h b/aten/src/TH/THMemoryFile.h index b54cdcc2f2cfa0..c8cab3667b8ffc 100644 --- a/aten/src/TH/THMemoryFile.h +++ b/aten/src/TH/THMemoryFile.h @@ -2,7 +2,7 @@ #define TH_MEMORY_FILE_INC #include "THFile.h" -#include "THStorage.h" +#include "THStorageFunctions.h" TH_API THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode); TH_API THFile *THMemoryFile_new(const char *mode); diff --git a/aten/src/TH/THStorage.h b/aten/src/TH/THStorage.h index ce53827b9f6fce..3d25c2129682ff 100644 --- a/aten/src/TH/THStorage.h +++ b/aten/src/TH/THStorage.h @@ -1,25 +1,4 @@ #pragma once +#include "THStorageFunctions.h" -#include "THGeneral.h" -#include "THAllocator.h" - -#define THStorage_(NAME) TH_CONCAT_4(TH,Real,Storage_,NAME) - -#include "generic/THStorage.h" 
-#include "THGenerateAllTypes.h" - -#include "generic/THStorage.h" -#include "THGenerateHalfType.h" - -#include "generic/THStorageCopy.h" -#include "THGenerateAllTypes.h" - -#include "generic/THStorageCopy.h" -#include "THGenerateHalfType.h" - -// This exists to have a data-type independent way of freeing (necessary for THPPointer). -TH_API void THStorage_free(THStorage *storage); -TH_API void THStorage_weakFree(THStorage *storage); - -TH_API THDescBuff THLongStorage_sizeDesc(const THLongStorage *size); -TH_API THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement); +// Compatability header. Use THStorageFunctions.h instead if you need this. diff --git a/aten/src/TH/THStorageClass.cpp b/aten/src/TH/THStorageClass.cpp new file mode 100644 index 00000000000000..49e9a6b6c8c20c --- /dev/null +++ b/aten/src/TH/THStorageClass.cpp @@ -0,0 +1,28 @@ +#include "THStorageClass.hpp" + +THStorage::THStorage( + at::ScalarType scalar_type, + ptrdiff_t size, + at::DataPtr data_ptr, + at::Allocator* allocator, + char flag) + : scalar_type(scalar_type), + data_ptr(std::move(data_ptr)), + size(size), + refcount(1), + weakcount(1), // from the strong reference + flag(flag), + allocator(allocator), + finalizer(nullptr) {} + +THStorage::THStorage( + at::ScalarType scalar_type, + ptrdiff_t size, + at::Allocator* allocator, + char flag) + : THStorage( + scalar_type, + size, + allocator->allocate(at::elementSize(scalar_type) * size), + allocator, + flag) {} diff --git a/aten/src/TH/THStorageClass.hpp b/aten/src/TH/THStorageClass.hpp new file mode 100644 index 00000000000000..99031f635b7035 --- /dev/null +++ b/aten/src/TH/THStorageClass.hpp @@ -0,0 +1,76 @@ +#pragma once + +// STOP!!! Thinking of including this header directly? Please +// read Note [TH abstraction violation] + +#include + +#include +#include +#include "THTypeConversion.hpp" +#include + +// Note [Weak references for intrusive refcounting] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Here's the scheme: +// +// - refcount == number of strong references to the object +// weakcount == number of weak references to the object, +// plus one more if refcount > 0 +// +// - THStorage stays live as long as there are any strong +// or weak pointers to it (weakcount > 0, since strong +// references count as a +1 to weakcount) +// +// - finalizers are called and data_ptr is deallocated when refcount == 0 +// +// - Once refcount == 0, it can never again be > 0 (the transition +// from > 0 to == 0 is monotonic) +// +// - When you access THStorage via a weak pointer, you must +// atomically increment the use count, if it is greater than 0. +// If it is not, you must report that the storage is dead. 
+// + +struct THFinalizer { + virtual void operator()() = 0; + virtual ~THFinalizer() {}; +}; + +struct THStorage +{ + THStorage() = delete; + THStorage(at::ScalarType, ptrdiff_t, at::DataPtr, at::Allocator*, char); + THStorage(at::ScalarType, ptrdiff_t, at::Allocator*, char); + at::ScalarType scalar_type; + at::DataPtr data_ptr; + ptrdiff_t size; + std::atomic refcount; + std::atomic weakcount; + char flag; + at::Allocator* allocator; + std::unique_ptr finalizer; + struct THStorage* view; + THStorage(THStorage&) = delete; + THStorage(const THStorage&) = delete; + THStorage(THStorage&&) = delete; + THStorage(const THStorage&&) = delete; + + template + inline T* data() const { + auto scalar_type_T = at::CTypeToScalarType>::to(); + if (scalar_type != scalar_type_T) { + AT_ERROR( + "Attempt to access Storage having data type ", + at::toString(scalar_type), + " as data type ", + at::toString(scalar_type_T)); + } + return unsafe_data(); + } + + template + inline T* unsafe_data() const { + return static_cast(this->data_ptr.get()); + } +}; diff --git a/aten/src/TH/THStorage.cpp b/aten/src/TH/THStorageFunctions.cpp similarity index 59% rename from aten/src/TH/THStorage.cpp rename to aten/src/TH/THStorageFunctions.cpp index f4910c3f07fe32..c3db776b632e75 100644 --- a/aten/src/TH/THStorage.cpp +++ b/aten/src/TH/THStorageFunctions.cpp @@ -1,6 +1,6 @@ #include -#include "THStorage.hpp" +#include "THStorageFunctions.hpp" #include "generic/THStorage.cpp" #include "THGenerateAllTypes.h" @@ -25,8 +25,8 @@ void THStorage_free(THStorage *storage) { if (storage->finalizer) { (*storage->finalizer)(); } - storage->finalizer.~unique_ptr(); - storage->data_ptr.~DataPtr(); + storage->finalizer = nullptr; + storage->data_ptr.clear(); THStorage_weakFree(storage); } } @@ -40,9 +40,7 @@ void THStorage_weakRetain(THStorage *weak_storage) { // Releases a weak reference void THStorage_weakFree(THStorage *weak_storage) { if (--weak_storage->weakcount == 0) { - weak_storage->refcount.~atomic(); - weak_storage->weakcount.~atomic(); - THFree(weak_storage); + delete weak_storage; } } @@ -91,62 +89,11 @@ THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElemen return copy; } -THStorage* THStorage_new(at::ScalarType scalar_type) -{ - return THStorage_newWithSize(scalar_type, 0); -} - -THStorage* THStorage_newWithSize(at::ScalarType scalar_type, ptrdiff_t size) -{ - return THStorage_newWithAllocator(scalar_type, size, getTHDefaultAllocator()); -} - -THStorage* THStorage_newWithAllocator(at::ScalarType scalar_type, ptrdiff_t size, - at::Allocator *allocator) -{ - THStorage *storage = static_cast(THAlloc(sizeof(THStorage))); - storage->scalar_type = scalar_type; - new (&storage->data_ptr) at::DataPtr(allocator->allocate(at::elementSize(scalar_type)*size)); - storage->size = size; - new (&storage->refcount) std::atomic(1); - new (&storage->weakcount) std::atomic(1); // from the strong reference - new (&storage->finalizer) std::unique_ptr(nullptr); - storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE; - storage->allocator = allocator; - return storage; -} - ptrdiff_t THStorage_size(const THStorage *self) { return self->size; } -size_t THStorage_elementSize(const THStorage *self) -{ - return at::elementSize(self->scalar_type); -} - -THStorage* THStorage_newWithMapping(at::ScalarType scalar_type, const char *filename, ptrdiff_t size, int flags) -{ - size_t actual_size = -1; - THStorage *storage = THStorage_newWithDataAndAllocator(scalar_type, - THMapAllocator::makeDataPtr( - filename, - flags, - size 
* at::elementSize(scalar_type), - &actual_size), - size, - /* allocator */ nullptr); - - if (size <= 0) { - storage->size = actual_size/THStorage_elementSize(storage); - } - - THStorage_clearFlag(storage, TH_STORAGE_RESIZABLE); - - return storage; -} - void THStorage_setFlag(THStorage *storage, const char flag) { storage->flag |= flag; @@ -173,21 +120,6 @@ THStorage* THStorage_newWithData(at::ScalarType scalar_type, std::unique_ptr(THAlloc(sizeof(THStorage))); - storage->scalar_type = scalar_type; - new (&storage->data_ptr) at::DataPtr(std::move(data)); - storage->size = size; - new (&storage->refcount) std::atomic(1); - new (&storage->weakcount) std::atomic(1); // from the strong reference - new (&storage->finalizer) std::unique_ptr(nullptr); - storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE; - storage->allocator = allocator; - return storage; -} - void THStorage_resize(THStorage *storage, ptrdiff_t size) { if (storage->flag & TH_STORAGE_RESIZABLE) diff --git a/aten/src/TH/THStorageFunctions.h b/aten/src/TH/THStorageFunctions.h new file mode 100644 index 00000000000000..ce53827b9f6fce --- /dev/null +++ b/aten/src/TH/THStorageFunctions.h @@ -0,0 +1,25 @@ +#pragma once + +#include "THGeneral.h" +#include "THAllocator.h" + +#define THStorage_(NAME) TH_CONCAT_4(TH,Real,Storage_,NAME) + +#include "generic/THStorage.h" +#include "THGenerateAllTypes.h" + +#include "generic/THStorage.h" +#include "THGenerateHalfType.h" + +#include "generic/THStorageCopy.h" +#include "THGenerateAllTypes.h" + +#include "generic/THStorageCopy.h" +#include "THGenerateHalfType.h" + +// This exists to have a data-type independent way of freeing (necessary for THPPointer). +TH_API void THStorage_free(THStorage *storage); +TH_API void THStorage_weakFree(THStorage *storage); + +TH_API THDescBuff THLongStorage_sizeDesc(const THLongStorage *size); +TH_API THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement); diff --git a/aten/src/TH/THStorage.hpp b/aten/src/TH/THStorageFunctions.hpp similarity index 50% rename from aten/src/TH/THStorage.hpp rename to aten/src/TH/THStorageFunctions.hpp index e02e265062d94b..9ef48dcfbd870e 100644 --- a/aten/src/TH/THStorage.hpp +++ b/aten/src/TH/THStorageFunctions.hpp @@ -3,7 +3,8 @@ // STOP!!! Thinking of including this header directly? Please // read Note [TH abstraction violation] -#include "THStorage.h" +#include "THStorageClass.hpp" +#include "THStorageFunctions.h" #include #include @@ -32,52 +33,11 @@ // If it is not, you must report that the storage is dead. 
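The same note, carried over into the renamed THStorageFunctions.hpp, ends with the rule for reading through a weak pointer: atomically bump the strong count only if it is still greater than zero, otherwise report the storage as dead. The patch documents the rule without adding a helper for it, so the following is only a sketch of what such a promotion could look like; try_promote_weak is a hypothetical name.

```cpp
#include <atomic>

// Returns true and takes a strong reference if the object is still alive;
// returns false if refcount has already reached zero (the storage is dead).
bool try_promote_weak(std::atomic<int>& refcount) {
  int cur = refcount.load();
  while (cur > 0) {
    // Bump the strong count only if it has not dropped to zero in the
    // meantime; on failure, cur is refreshed and the check repeats.
    if (refcount.compare_exchange_weak(cur, cur + 1)) {
      return true;
    }
  }
  return false;
}
```

A caller that gets true back holds a strong reference and must eventually release it through the strong-release path sketched earlier.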
// -struct THFinalizer { - virtual void operator()() = 0; - virtual ~THFinalizer() {}; -}; - -typedef struct THStorage -{ - at::ScalarType scalar_type; - at::DataPtr data_ptr; - ptrdiff_t size; - std::atomic refcount; - std::atomic weakcount; - char flag; - at::Allocator *allocator; - std::unique_ptr finalizer; - - template - inline T * data() const { - auto scalar_type_T = at::CTypeToScalarType>::to(); - if (scalar_type != scalar_type_T) { - AT_ERROR("Attempt to access Storage having data type ", at::toString(scalar_type), - " as data type ", at::toString(scalar_type_T)); - } - return unsafe_data(); - } - - template - inline T * unsafe_data() const { - return static_cast(this->data_ptr.get()); - } -} THStorage; - -TH_API THStorage* THStorage_new(at::ScalarType scalar_type); -TH_API THStorage* THStorage_newWithSize(at::ScalarType scalar_type, ptrdiff_t size); -TH_API THStorage* THStorage_newWithAllocator(at::ScalarType scalar_type, ptrdiff_t size, - at::Allocator *allocator); - ptrdiff_t THStorage_size(const THStorage *self); -size_t THStorage_elementSize(); -THStorage* THStorage_newWithMapping(at::ScalarType scalar_type, const char *filename, ptrdiff_t size, int flags); + void THStorage_setFlag(THStorage *storage, const char flag); void THStorage_clearFlag(THStorage *storage, const char flag); void THStorage_retain(THStorage *storage); -THStorage* THStorage_newWithDataAndAllocator(at::ScalarType scalar_type, - at::DataPtr&& data, ptrdiff_t size, - at::Allocator* allocator); void THStorage_resize(THStorage *storage, ptrdiff_t size); void THStorage_swap(THStorage *storage1, THStorage *storage2); diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index b2815ca4868dcd..48ddcd2c57ba10 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -43,17 +43,9 @@ void THTensor_free(THTensor *self) if(!self) return; - if(self->flag & TH_TENSOR_REFCOUNTED) + if(--self->refcount == 0) { - if(--self->refcount == 0) - { - THFree(self->size); - THFree(self->stride); - if(self->storage) - THStorage_free(self->storage); - self->refcount.~atomic(); - THFree(self); - } + delete self; } } diff --git a/aten/src/TH/THTensor.h b/aten/src/TH/THTensor.h index 3984bf9172ff0c..3335a6f5d8cc50 100644 --- a/aten/src/TH/THTensor.h +++ b/aten/src/TH/THTensor.h @@ -1,7 +1,7 @@ #ifndef TH_TENSOR_INC #define TH_TENSOR_INC -#include "THStorage.h" +#include "THStorageFunctions.h" #include "THTensorApply.h" #define THTensor_(NAME) TH_CONCAT_4(TH,Real,Tensor_,NAME) diff --git a/aten/src/TH/THTensor.hpp b/aten/src/TH/THTensor.hpp index 71236afa4b5626..bc9f23ee5e3cee 100644 --- a/aten/src/TH/THTensor.hpp +++ b/aten/src/TH/THTensor.hpp @@ -4,24 +4,38 @@ // read Note [TH abstraction violation] #include "THTensor.h" -#include "THStorage.hpp" +#include "THStorageFunctions.hpp" #include #include -typedef struct THTensor +struct THTensor { - int64_t *size; - int64_t *stride; - int64_t dim_; + THTensor(THStorage* storage) + : refcount(1) + , storage(storage) + , storageOffset(0) + , sizes_{0} + , strides_{1} + , dim_(1) + {} + + ~THTensor() { + if (storage) { + THStorage_free(storage); + } + } + + std::atomic refcount; // Note: storage->size may be greater than the recorded size // of a tensor THStorage *storage; ptrdiff_t storageOffset; - std::atomic refcount; - char flag; + std::vector sizes_; + std::vector strides_; + int64_t dim_; template inline T * data() const { @@ -47,21 +61,65 @@ typedef struct THTensor // represents that numel() == 0. 
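The THTensor rewrite in this hunk replaces the raw int64_t size / stride arrays with std::vector<int64_t> sizes_ / strides_ plus size(d) / stride(d) accessors, which is why the new helpers further down (THTensor_resizeDim, THTensor_setSizesAndStrides, THTensor_setSizeAtDim, THTensor_setStrideAtDim) stay small and why resizeDim can preserve existing entries. A toy model of the new layout, with illustrative names only:

```cpp
#include <cstdint>
#include <vector>

struct ToyTensor {
  std::vector<int64_t> sizes_;
  std::vector<int64_t> strides_;
  int64_t dim_ = 0;
};

// Mirrors THTensor_resizeDim in the patch: a true resize that keeps the
// existing sizes/strides, which calling code such as squeeze depends on.
void resize_dim(ToyTensor& t, int64_t ndim) {
  t.dim_ = ndim;
  t.sizes_.resize(ndim);
  t.strides_.resize(ndim);
}

// A tensor is empty iff some size is 0, matching is_empty() in the hunk
// that follows.
bool is_empty(const ToyTensor& t) {
  for (int64_t s : t.sizes_) {
    if (s == 0) return true;
  }
  return false;
}
```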
inline bool is_empty() const { for (int64_t i = 0; i < dim_; ++i) { - if (size[i] == 0) { - return true; + if (sizes_[i] == 0) { + return true; } } return false; } + int64_t size(int64_t d) const { + d = at::maybe_wrap_dim(d, dim(), false); + return sizes_[d]; + } + + int64_t stride(int64_t d) const { + d = at::maybe_wrap_dim(d, dim(), false); + return strides_[d]; + } + inline at::IntList sizes() { - return at::IntList(size, dim_); + return sizes_; } -} THTensor; + + inline at::IntList strides() { + return strides_; + } +}; #include "generic/THTensorFastGetSet.hpp" #include "THGenerateAllTypes.h" +inline int64_t* THTensor_getSizePtr(THTensor* tensor) { + return tensor->sizes_.data(); +} + +inline int64_t* THTensor_getStridePtr(THTensor* tensor) { + return tensor->strides_.data(); +} + +inline void THTensor_resizeDim(THTensor* tensor, int64_t ndim) { + tensor->dim_ = ndim; + // NB: This is *truly* a resize; calling code (e.g., squeeze) + // assumes that old values are preserved + tensor->sizes_.resize(ndim); + tensor->strides_.resize(ndim); +} + +inline void THTensor_setSizesAndStrides(THTensor* tensor, std::vector&& new_size, std::vector&& new_stride) { + tensor->dim_ = new_size.size(); + tensor->sizes_ = std::move(new_size); + tensor->strides_ = std::move(new_stride); +} + +inline void THTensor_setSizeAtDim(THTensor* tensor, int dim, int64_t new_size) { + tensor->sizes_[dim] = new_size; +} + +inline void THTensor_setStrideAtDim(THTensor* tensor, int dim, int64_t new_stride) { + tensor->strides_[dim] = new_stride; +} + TH_API void THTensor_free(THTensor *self); at::optional> THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, at::IntList newshape); diff --git a/aten/src/TH/THTensorApply.h b/aten/src/TH/THTensorApply.h index 0b699e89d832c6..514a4969df83e2 100644 --- a/aten/src/TH/THTensorApply.h +++ b/aten/src/TH/THTensorApply.h @@ -37,7 +37,7 @@ int TENSOR##_contiguous = ALLOW_CONTIGUOUS && DIM < 0; \ TENSOR##_n = 1; \ for(TENSOR##_i = 0; TENSOR##_i < TENSOR->dim(); TENSOR##_i++) \ - TENSOR##_n *= TENSOR->size[TENSOR##_i]; \ + TENSOR##_n *= TENSOR->size(TENSOR##_i); \ \ if(TENSOR->is_empty()) \ TH_TENSOR_APPLY_hasFinished = 1; \ @@ -47,9 +47,9 @@ TENSOR##_size = 1; \ TENSOR##_stride = 1; \ for(TENSOR##_i = TENSOR->_dim()-1; TENSOR##_i >= 0; TENSOR##_i--) { \ - if(TENSOR->size[TENSOR##_i] != 1) { \ - if(TENSOR->stride[TENSOR##_i] == TENSOR##_size && TENSOR##_i != DIM) \ - TENSOR##_size *= TENSOR->size[TENSOR##_i]; \ + if(TENSOR->size(TENSOR##_i) != 1) { \ + if(TENSOR->stride(TENSOR##_i) == TENSOR##_size && TENSOR##_i != DIM) \ + TENSOR##_size *= TENSOR->size(TENSOR##_i); \ else{ \ TENSOR##_contiguous = 0; \ break; \ @@ -61,7 +61,7 @@ TENSOR##_dim = 1; \ for(TENSOR##_i = TENSOR->_dim()-2; TENSOR##_i >= 0; TENSOR##_i--) \ { \ - if(TENSOR->stride[TENSOR##_i] != TENSOR->stride[TENSOR##_i+1] * TENSOR->size[TENSOR##_i+1] || TENSOR##_i == DIM || TENSOR##_i+1 == DIM) \ + if(TENSOR->stride(TENSOR##_i) != TENSOR->stride(TENSOR##_i+1) * TENSOR->size(TENSOR##_i+1) || TENSOR##_i == DIM || TENSOR##_i+1 == DIM) \ TENSOR##_dim++; \ } \ /* Allocate an array of 3*dim elements, where dim is the number of contiguous sections */ \ @@ -70,8 +70,8 @@ TENSOR##_strides = TENSOR##_counter + 2*TENSOR##_dim; \ TH_TENSOR_dim_index = TENSOR##_dim-1; \ TENSOR##_dimOffset = (DIM == TENSOR->_dim()-1) ? 
&TENSOR##_i : &TENSOR##_counter[DIM]; \ - TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR->_dim()-1]; \ - TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride[TENSOR->_dim()-1]; \ + TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size(TENSOR->_dim()-1); \ + TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride(TENSOR->_dim()-1); \ /* TENSOR##_counter tracks where we are in the storage. The offset into the */ \ /* storage is given by storage_offset + (i * j), where i is the stride */ \ /* vector and j is tensor_counter vector. This sets the starting position for the loop. */ \ @@ -79,14 +79,14 @@ TENSOR##_counter[TENSOR##_i] = 0; \ } \ for(TENSOR##_i = TENSOR->_dim()-2; TENSOR##_i >= 0; --TENSOR##_i) { \ - if (TENSOR->stride[TENSOR##_i] == TENSOR->stride[TENSOR##_i+1] * TENSOR->size[TENSOR##_i+1] && TENSOR##_i != DIM && TENSOR##_i+1 != DIM) { \ - TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR##_i] * TENSOR##_sizes[TH_TENSOR_dim_index]; \ + if (TENSOR->stride(TENSOR##_i) == TENSOR->stride(TENSOR##_i+1) * TENSOR->size(TENSOR##_i+1) && TENSOR##_i != DIM && TENSOR##_i+1 != DIM) { \ + TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size(TENSOR##_i) * TENSOR##_sizes[TH_TENSOR_dim_index]; \ if (DIM != TENSOR->_dim()-1 && TENSOR##_i < DIM) \ TENSOR##_dimOffset--; \ } else { \ --TH_TENSOR_dim_index; \ - TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR##_i]; \ - TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride[TENSOR##_i]; \ + TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size(TENSOR##_i); \ + TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride(TENSOR##_i); \ } \ } \ /* Size of the inner most section */ \ @@ -160,13 +160,12 @@ elements_equal = 0; \ } \ if (elements_equal == 0) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->dim()); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->dim()); \ - THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->dim()); \ - THError("inconsistent tensor size, expected %s %s, %s %s and %s %s to have the same " \ - "number of elements, but got %d, %d and %d elements respectively", \ - #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, #TENSOR3, T3buff.str, \ - TENSOR1##_n, TENSOR2##_n, TENSOR3##_n); \ + AT_ERROR("inconsistent tensor size, expected ", \ + #TENSOR1, " ", TENSOR1->sizes(), ", ", \ + #TENSOR2, " ", TENSOR2->sizes(), " and ", \ + #TENSOR3, " ", TENSOR3->sizes(), " to have the same " \ + "number of elements, but got ", TENSOR1##_n, ", ", \ + TENSOR2##_n, " and ", TENSOR3##_n, " elements respectively"); \ } \ \ while(!TH_TENSOR_APPLY_hasFinished) \ @@ -199,11 +198,11 @@ __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \ \ if(TENSOR1##_n != TENSOR2##_n) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->dim()); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->dim()); \ - THError("inconsistent tensor size, expected %s %s and %s %s to have the same " \ - "number of elements, but got %d and %d elements respectively", \ - #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, TENSOR1##_n, TENSOR2##_n); \ + AT_ERROR("inconsistent tensor size, expected ", \ + #TENSOR1, " ", TENSOR1->sizes(), " and ", \ + #TENSOR2, " ", TENSOR2->sizes(), \ + " to have the same number of elements, but got ", \ + TENSOR1##_n, " and ", TENSOR2##_n, " elements respectively"); \ } \ while(!TH_TENSOR_APPLY_hasFinished) \ { \ diff --git a/aten/src/TH/THTensorDimApply.h b/aten/src/TH/THTensorDimApply.h index 828b92dcb3ae69..e85bd0e9137e87 100644 --- a/aten/src/TH/THTensorDimApply.h +++ b/aten/src/TH/THTensorDimApply.h @@ 
-9,25 +9,21 @@ #define TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM(TENSOR1, TENSOR2, TENSOR3, DIMENSION) \ { \ int shape_check_flag = 0; \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->_dim(); TH_TENSOR_DIM_APPLY_i++) \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ { \ if (TH_TENSOR_DIM_APPLY_i == DIMENSION) \ continue; \ - if (TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR2->size[TH_TENSOR_DIM_APPLY_i]) { \ + if (TENSOR1->size(TH_TENSOR_DIM_APPLY_i) != TENSOR2->size(TH_TENSOR_DIM_APPLY_i)) { \ shape_check_flag = 1; \ break; \ } \ - if(TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR3->size[TH_TENSOR_DIM_APPLY_i]) { \ + if(TENSOR1->size(TH_TENSOR_DIM_APPLY_i) != TENSOR3->size(TH_TENSOR_DIM_APPLY_i)) { \ shape_check_flag = 1; \ break; \ } \ } \ if (shape_check_flag == 1) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->_dim()); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->_dim()); \ - THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->_dim()); \ - THError("Expected %s %s, %s %s and %s %s to have the same size apart from dimension %d", \ - #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, #TENSOR3, T3buff.str, DIMENSION); \ + AT_ERROR("Expected ", #TENSOR1, " ", TENSOR1->sizes(), ", ", #TENSOR2, " ", TENSOR2->sizes(), " and ", #TENSOR3, " ", TENSOR3->sizes(), " to have the same size apart from dimension ", DIMENSION); \ } \ } @@ -40,55 +36,54 @@ TYPE3 *TENSOR3##_data = NULL; \ TH_UNUSED int64_t TENSOR3##_stride = 0, TENSOR3##_size = 0; \ int64_t *TH_TENSOR_DIM_APPLY_counter = NULL; \ - int TH_TENSOR_DIM_APPLY_hasFinished = 0; \ + int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \ int TH_TENSOR_DIM_APPLY_i; \ \ - if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->_dim()) ) \ - THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, TENSOR1->_dim()); \ + if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->dim()) ) \ + THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, TENSOR1->dim()); \ int same_dims = 1; \ - if( TENSOR1->_dim() != TENSOR2->_dim() ) { \ + if( TENSOR1->dim() != TENSOR2->dim() ) { \ same_dims = 0; \ } \ - if( TENSOR1->_dim() != TENSOR3->_dim() ) { \ + if( TENSOR1->dim() != TENSOR3->dim() ) { \ same_dims = 0; \ } \ if (same_dims == 0) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->_dim()); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->_dim()); \ - THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->_dim()); \ - THError("inconsistent tensor size, expected %s %s, %s %s and %s %s to have the same " \ - "number of dimensions", #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, #TENSOR3, T3buff.str); \ + AT_ERROR("inconsistent tensor size, expected ", #TENSOR1, " ", TENSOR1->sizes(), ", ", #TENSOR2, " ", TENSOR2->sizes(), " and ", #TENSOR3, " ",TENSOR3->sizes() , " to have the same number of dimensions"); \ } \ SIZE_CHECK(TENSOR1, TENSOR2, TENSOR3, DIMENSION) \ \ - TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->_dim())); \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->_dim(); TH_TENSOR_DIM_APPLY_i++) \ + if (TH_TENSOR_DIM_APPLY_hasFinished) { \ + return; \ + } \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->dim())); \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ \ TENSOR1##_data = 
(TENSOR1)->storage->data()+(TENSOR1)->storageOffset; \ - TENSOR1##_stride = (TENSOR1)->stride[DIMENSION]; \ - TENSOR1##_size = TENSOR1->size[DIMENSION]; \ + TENSOR1##_stride = (TENSOR1)->stride(DIMENSION); \ + TENSOR1##_size = TENSOR1->size(DIMENSION); \ \ TENSOR2##_data = (TENSOR2)->storage->data()+(TENSOR2)->storageOffset; \ - TENSOR2##_stride = (TENSOR2)->stride[DIMENSION]; \ - TENSOR2##_size = TENSOR2->size[DIMENSION]; \ + TENSOR2##_stride = (TENSOR2)->stride(DIMENSION); \ + TENSOR2##_size = TENSOR2->size(DIMENSION); \ \ TENSOR3##_data = (TENSOR3)->storage->data()+(TENSOR3)->storageOffset; \ - TENSOR3##_stride = (TENSOR3)->stride[DIMENSION]; \ - TENSOR3##_size = TENSOR3->size[DIMENSION]; \ + TENSOR3##_stride = (TENSOR3)->stride(DIMENSION); \ + TENSOR3##_size = TENSOR3->size(DIMENSION); \ \ while(!TH_TENSOR_DIM_APPLY_hasFinished) \ { \ CODE \ \ - if(TENSOR1->_dim() == 1) \ + if(TENSOR1->dim() == 1) \ break; \ \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->_dim(); TH_TENSOR_DIM_APPLY_i++) \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ { \ if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->_dim()-1) \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ @@ -97,22 +92,22 @@ } \ \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \ - TENSOR1##_data += TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ - TENSOR2##_data += TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ - TENSOR3##_data += TENSOR3->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR1##_data += TENSOR1->stride(TH_TENSOR_DIM_APPLY_i); \ + TENSOR2##_data += TENSOR2->stride(TH_TENSOR_DIM_APPLY_i); \ + TENSOR3##_data += TENSOR3->stride(TH_TENSOR_DIM_APPLY_i); \ \ - if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size[TH_TENSOR_DIM_APPLY_i]) \ + if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size(TH_TENSOR_DIM_APPLY_i)) \ { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->_dim()-1) \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ } \ else \ { \ - TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ - TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ - TENSOR3##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR3->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR1->stride(TH_TENSOR_DIM_APPLY_i); \ + TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR2->stride(TH_TENSOR_DIM_APPLY_i); \ + TENSOR3##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR3->stride(TH_TENSOR_DIM_APPLY_i); \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ } \ } \ @@ -147,54 +142,51 @@ TYPE2 *TENSOR2##_data = NULL; \ TH_UNUSED int64_t TENSOR2##_stride = 0, TENSOR2##_size = 0; \ int64_t *TH_TENSOR_DIM_APPLY_counter = NULL; \ - int TH_TENSOR_DIM_APPLY_hasFinished = 0; \ + int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \ int TH_TENSOR_DIM_APPLY_i; \ \ - if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->_dim()) ) \ + if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->dim()) ) \ THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, TENSOR1->_dim()); \ - if( TENSOR1->_dim() != TENSOR2->_dim() ) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->_dim()); \ - 
THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->_dim()); \ - THError("inconsistent tensor size, expected %s %s and %s %s to have the same " \ - "number of dimensions", #TENSOR1, T1buff.str, #TENSOR2, T2buff.str); \ + if( TENSOR1->dim() != TENSOR2->dim() ) { \ + AT_ERROR("inconsistent tensor size, expected ", #TENSOR1, " ", TENSOR1->sizes(), " and ", #TENSOR2, " ", TENSOR2->sizes(), " to have the same number of dimensions"); \ } \ TH_UNUSED int shape_check_flag = 0; \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->_dim(); TH_TENSOR_DIM_APPLY_i++) \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ { \ if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ continue; \ - if(TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR2->size[TH_TENSOR_DIM_APPLY_i]) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->_dim()); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->_dim()); \ - THError("Expected %s %s and %s %s to have the same size in dimension %d", \ - #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, DIMENSION); \ + if(TENSOR1->size(TH_TENSOR_DIM_APPLY_i) != TENSOR2->size(TH_TENSOR_DIM_APPLY_i)) { \ + AT_ERROR("Expected ", #TENSOR1, " ", TENSOR1->sizes(), " and ", #TENSOR2, " ", TENSOR2->sizes(), " to have the same size in dimension ", DIMENSION); \ } \ } \ \ - TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->_dim())); \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->_dim(); TH_TENSOR_DIM_APPLY_i++) \ + if (TH_TENSOR_DIM_APPLY_hasFinished) { \ + return; \ + } \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->dim())); \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ \ TENSOR1##_data = (TENSOR1)->storage->data()+(TENSOR1)->storageOffset; \ - TENSOR1##_stride = (TENSOR1)->stride[DIMENSION]; \ - TENSOR1##_size = TENSOR1->size[DIMENSION]; \ + TENSOR1##_stride = (TENSOR1)->stride(DIMENSION); \ + TENSOR1##_size = TENSOR1->size(DIMENSION); \ \ TENSOR2##_data = (TENSOR2)->storage->data()+(TENSOR2)->storageOffset; \ - TENSOR2##_stride = (TENSOR2)->stride[DIMENSION]; \ - TENSOR2##_size = TENSOR2->size[DIMENSION]; \ + TENSOR2##_stride = (TENSOR2)->stride(DIMENSION); \ + TENSOR2##_size = TENSOR2->size(DIMENSION); \ \ while(!TH_TENSOR_DIM_APPLY_hasFinished) \ { \ CODE \ \ - if(TENSOR1->_dim() == 1) \ + if(TENSOR1->dim() == 1) \ break; \ \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->_dim(); TH_TENSOR_DIM_APPLY_i++) \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ { \ if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->_dim()-1) \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ @@ -203,20 +195,20 @@ } \ \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \ - TENSOR1##_data += TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ - TENSOR2##_data += TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR1##_data += TENSOR1->stride(TH_TENSOR_DIM_APPLY_i); \ + TENSOR2##_data += TENSOR2->stride(TH_TENSOR_DIM_APPLY_i); \ \ - if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size[TH_TENSOR_DIM_APPLY_i]) \ + if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size(TH_TENSOR_DIM_APPLY_i)) \ { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->_dim()-1) \ + if(TH_TENSOR_DIM_APPLY_i == 
TENSOR1->dim()-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ } \ else \ { \ - TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ - TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR1->stride(TH_TENSOR_DIM_APPLY_i); \ + TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR2->stride(TH_TENSOR_DIM_APPLY_i); \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ } \ } \ @@ -278,8 +270,8 @@ THError("invalid dimension"); \ \ TENSOR##_data = (TENSOR)->storage->data()+(TENSOR)->storageOffset; \ - TENSOR##_stride = (TENSOR)->stride[DIMENSION]; \ - TENSOR##_size = TENSOR->size[DIMENSION]; \ + TENSOR##_stride = (TENSOR)->stride(DIMENSION); \ + TENSOR##_size = TENSOR->size(DIMENSION); \ /* Counter stores the indices into the Tensor at any time */ \ TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR->_dim())); \ for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR->_dim(); TH_TENSOR_DIM_APPLY_i++) \ @@ -310,9 +302,9 @@ \ /* Bump the counter at this index, update the pointer */ \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \ - TENSOR##_data += TENSOR->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR##_data += TENSOR->stride(TH_TENSOR_DIM_APPLY_i); \ \ - if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR->size[TH_TENSOR_DIM_APPLY_i]) \ + if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR->size(TH_TENSOR_DIM_APPLY_i)) \ { \ /* Handled TENSOR_size(dim) iterations for DIM_APPLY_i. If this is the last dimension, exit */ \ if(TH_TENSOR_DIM_APPLY_i == TENSOR->_dim()-1) \ @@ -323,7 +315,7 @@ else \ { \ /* Reset the counter, and the pointer to the beginning of the storage for this combination of indices */ \ - TENSOR##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR->stride[TH_TENSOR_DIM_APPLY_i]; \ + TENSOR##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR->stride(TH_TENSOR_DIM_APPLY_i); \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ } \ } \ diff --git a/aten/src/TH/generic/THStorage.cpp b/aten/src/TH/generic/THStorage.cpp index 2d499b0b67578c..de4b7035fcd4d0 100644 --- a/aten/src/TH/generic/THStorage.cpp +++ b/aten/src/TH/generic/THStorage.cpp @@ -21,24 +21,55 @@ size_t THStorage_(elementSize)() THStorage* THStorage_(new)(void) { - return THStorage_new(at::CTypeToScalarType>::to()); + THStorage* storage = new THStorage( + at::CTypeToScalarType>::to(), + 0, + getTHDefaultAllocator(), + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); + return storage; } THStorage* THStorage_(newWithSize)(ptrdiff_t size) { - return THStorage_newWithSize(at::CTypeToScalarType>::to(), size); + THStorage* storage = new THStorage( + at::CTypeToScalarType>::to(), + size, + getTHDefaultAllocator(), + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); + return storage; } THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, at::Allocator *allocator) { - return THStorage_newWithAllocator(at::CTypeToScalarType>::to(), size, allocator); + THStorage* storage = new THStorage( + at::CTypeToScalarType>::to(), + size, + allocator, + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); + return storage; } THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int flags) { - return THStorage_newWithMapping(at::CTypeToScalarType>::to(), filename, size, flags); + auto scalar_type = 
at::CTypeToScalarType>::to(); + size_t actual_size = -1; + THStorage* storage = new THStorage( + scalar_type, + size, + THMapAllocator::makeDataPtr( + filename, flags, size * at::elementSize(scalar_type), &actual_size), + /* allocator */ nullptr, + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); + + if (size <= 0) { + storage->size = actual_size / at::elementSize(scalar_type); + } + + THStorage_clearFlag(storage, TH_STORAGE_RESIZABLE); + + return storage; } THStorage* THStorage_(newWithSize1)(real data0) @@ -101,7 +132,13 @@ void THStorage_(free)(THStorage *storage) THStorage* THStorage_(newWithDataAndAllocator)(at::DataPtr&& data, ptrdiff_t size, at::Allocator* allocator) { - return THStorage_newWithDataAndAllocator(at::CTypeToScalarType>::to(), std::move(data), size, allocator); + THStorage* storage = new THStorage( + at::CTypeToScalarType>::to(), + size, + std::move(data), + allocator, + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); + return storage; } void THStorage_(resize)(THStorage *storage, ptrdiff_t size) diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp index d03c7294f58c94..92314de69bae29 100644 --- a/aten/src/TH/generic/THTensor.cpp +++ b/aten/src/TH/generic/THTensor.cpp @@ -29,27 +29,27 @@ int64_t THTensor_(size)(const THTensor *self, int dim) { THArgCheck((dim >= 0) && (dim < self->dim()), 2, "dimension %d out of range of %dD tensor", dim+TH_INDEX_BASE, THTensor_(nDimension)(self)); - return self->size[dim]; + return self->size(dim); } int64_t THTensor_(stride)(const THTensor *self, int dim) { THArgCheck((dim >= 0) && (dim < self->dim()), 2, "dimension %d out of range of %dD tensor", dim+TH_INDEX_BASE, THTensor_(nDimension)(self)); - return self->stride[dim]; + return self->stride(dim); } THLongStorage *THTensor_(newSizeOf)(THTensor *self) { THLongStorage *size = THLongStorage_newWithSize(self->dim()); - THLongStorage_rawCopy(size, self->size); + THLongStorage_rawCopy(size, THTensor_getSizePtr(self)); return size; } THLongStorage *THTensor_(newStrideOf)(THTensor *self) { THLongStorage *stride = THLongStorage_newWithSize(self->dim()); - THLongStorage_rawCopy(stride, self->stride); + THLongStorage_rawCopy(stride, THTensor_getStridePtr(self)); return stride; } @@ -61,53 +61,36 @@ real *THTensor_(data)(const THTensor *self) return NULL; } -void THTensor_(setFlag)(THTensor *self, const char flag) -{ - self->flag |= flag; -} - -void THTensor_(clearFlag)(THTensor *self, const char flag) -{ - self->flag &= ~flag; -} - /**** creation methods ****/ -static void THTensor_(rawInit)(THTensor *self); - - /* Empty init */ THTensor *THTensor_(new)(void) { - THTensor *self = (THTensor *)THAlloc(sizeof(THTensor)); - THTensor_(rawInit)(self); - return self; + return new THTensor(THStorage_(new)()); } /* Pointer-copy init */ THTensor *THTensor_(newWithTensor)(THTensor *tensor) { - THTensor *self = (THTensor *)THAlloc(sizeof(THTensor)); - THTensor_(rawInit)(self); + THTensor *self = new THTensor(THStorage_(new)()); THTensor_(setStorageNd)(self, tensor->storage, tensor->storageOffset, tensor->dim(), - tensor->size, - tensor->stride); + THTensor_getSizePtr(tensor), + THTensor_getStridePtr(tensor)); return self; } /* Storage init */ THTensor *THTensor_(newWithStorage)(THStorage *storage, ptrdiff_t storageOffset, THLongStorage *size, THLongStorage *stride) { - THTensor *self = (THTensor *)THAlloc(sizeof(THTensor)); if(size && stride) { THArgCheck(size->size == stride->size, 4, "inconsistent size"); } - AT_CHECK(size, "size must not be null"); - THTensor_(rawInit)(self); 
+ + THTensor *self = new THTensor(THStorage_(new)()); #ifdef DEBUG THAssert(size->size <= INT_MAX); #endif @@ -123,8 +106,7 @@ THTensor *THTensor_(newWithStorage)(THStorage *storage, ptrdiff_t storageOffset, THTensor *THTensor_(newWithStorageIntLists)(THStorage *storage, ptrdiff_t storageOffset, at::IntList sizes, at::IntList strides) { AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); - THTensor *self = (THTensor *)THAlloc(sizeof(THTensor)); - THTensor_(rawInit)(self); + THTensor *self = new THTensor(THStorage_(new)()); THTensor_(setStorageNd)(self, storage, storageOffset, sizes.size(), const_cast(sizes.data()), const_cast(strides.data())); @@ -169,8 +151,7 @@ THTensor *THTensor_(newWithSize)(THLongStorage *size, THLongStorage *stride) } THTensor *THTensor_(newWithSizeIntList)(at::IntList sizes) { - THTensor *self = (THTensor *)THAlloc(sizeof(THTensor)); - THTensor_(rawInit)(self); + THTensor *self = new THTensor(THStorage_(new)()); THTensor_(resizeNd)(self, sizes.size(), const_cast(sizes.data()), nullptr); return self; @@ -248,8 +229,8 @@ THTensor *THTensor_(newView)(THTensor *tensor, THLongStorage *size) ptrdiff_t numel = THTensor_(nElement)(tensor); THTensor *self = THTensor_(new)(); THLongStorage *inferred_size = THLongStorage_newInferSize(size, numel); - auto stride = THTensor_compute_stride(at::IntList(tensor->size, tensor->dim()), - at::IntList(tensor->stride, tensor->dim()), + auto stride = THTensor_compute_stride(tensor->sizes(), + tensor->strides(), at::IntList(inferred_size->data(), inferred_size->size)); THArgCheck(stride.has_value(), 2, "view size is " "not compatible with input tensor's size and stride (at least one dimension spans " @@ -279,7 +260,7 @@ void THTensor_(resize)(THTensor *self, THLongStorage *size, THLongStorage *strid void THTensor_(resizeAs)(THTensor *self, THTensor *src) { if(!THTensor_(isSameSizeAs)(self, src)) - THTensor_(resizeNd)(self, src->dim(), src->size, NULL); + THTensor_(resizeNd)(self, src->dim(), THTensor_getSizePtr(src), NULL); } void THTensor_(resize1d)(THTensor *tensor, int64_t size0) @@ -319,8 +300,8 @@ void THTensor_(set)(THTensor *self, THTensor *src) src->storage, src->storageOffset, src->dim(), - src->size, - src->stride); + THTensor_getSizePtr(src), + THTensor_getStridePtr(src)); } void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_) @@ -401,14 +382,14 @@ void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension, int64_t fir #else THArgCheck( size > 0, 4, "out of range"); #endif - THArgCheck(firstIndex <= src->size[dimension] - size, 4, "out of range"); + THArgCheck(firstIndex <= src->size(dimension) - size, 4, "out of range"); THTensor_(set)(self, src); if(firstIndex > 0) - self->storageOffset += firstIndex*self->stride[dimension]; + self->storageOffset += firstIndex*self->stride(dimension); - self->size[dimension] = size; + THTensor_setSizeAtDim(self, dimension, size); } void THTensor_(select)(THTensor *self, THTensor *src, int dimension, int64_t sliceIndex) @@ -418,20 +399,24 @@ void THTensor_(select)(THTensor *self, THTensor *src, int dimension, int64_t sli if(!src) src = self; -#ifndef USE_TH_SCALAR +#ifndef USE_TH_SIZE_ZERO_DIM THArgCheck(src->_dim() > 1, 1, "cannot select on a vector"); +#else +#ifndef USE_TH_SCALAR + THArgCheck(src->dim() > 1, 1, "cannot select on a vector"); +#endif #endif THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "out of range"); - THArgCheck((sliceIndex >= 0) && 
(sliceIndex < src->size[dimension]), 3, "out of range"); + THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size(dimension)), 3, "out of range"); THTensor_(set)(self, src); THTensor_(narrow)(self, NULL, dimension, sliceIndex, 1); for(d = dimension; d < self->dim()-1; d++) { - self->size[d] = self->size[d+1]; - self->stride[d] = self->stride[d+1]; + THTensor_setSizeAtDim(self, d, self->size(d+1)); + THTensor_setStrideAtDim(self, d, self->stride(d+1)); } - self->dim_--; + THTensor_resizeDim(self, self->dim_ - 1); } void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1, int dimension2) @@ -441,26 +426,24 @@ void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1, int dim if(!src) src = self; - THArgCheck( (dimension1 >= 0) && (dimension1 < src->_dim()), 1, "out of range"); - THArgCheck( (dimension2 >= 0) && (dimension2 < src->_dim()), 2, "out of range"); + THArgCheck( (dimension1 >= 0) && (dimension1 < src->dim()), 1, "out of range"); + THArgCheck( (dimension2 >= 0) && (dimension2 < src->dim()), 2, "out of range"); THTensor_(set)(self, src); if(dimension1 == dimension2) return; - z = self->stride[dimension1]; - self->stride[dimension1] = self->stride[dimension2]; - self->stride[dimension2] = z; - z = self->size[dimension1]; - self->size[dimension1] = self->size[dimension2]; - self->size[dimension2] = z; + z = self->stride(dimension1); + THTensor_setStrideAtDim(self, dimension1, self->stride(dimension2)); + THTensor_setStrideAtDim(self, dimension2, z); + z = self->size(dimension1); + THTensor_setSizeAtDim(self, dimension1, self->size(dimension2)); + THTensor_setSizeAtDim(self, dimension2, z); } void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension, int64_t size, int64_t step) { - int64_t *newSize; - int64_t *newStride; int d; if(!src) @@ -470,36 +453,31 @@ void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension, int64_t siz THArgCheck(!src->is_empty(), 1, "cannot unfold an empty tensor"); #endif THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "out of range"); - THArgCheck(size <= src->size[dimension], 3, "out of range"); + THArgCheck(size <= src->size(dimension), 3, "out of range"); THArgCheck(step > 0, 4, "invalid step"); THTensor_(set)(self, src); - newSize = (int64_t *)THAlloc(sizeof(int64_t)*(self->dim()+1)); - newStride = (int64_t *)THAlloc(sizeof(int64_t)*(self->dim()+1)); + std::vector newSize(/* size */ self->dim()+1); + std::vector newStride(/* size */ self->dim()+1); newSize[self->dim()] = size; - newStride[self->dim()] = self->stride[dimension]; + newStride[self->dim()] = self->stride(dimension); for(d = 0; d < self->dim(); d++) { if(d == dimension) { - newSize[d] = (self->size[d] - size) / step + 1; - newStride[d] = step*self->stride[d]; + newSize[d] = (self->size(d) - size) / step + 1; + newStride[d] = step*self->stride(d); } else { - newSize[d] = self->size[d]; - newStride[d] = self->stride[d]; + newSize[d] = self->size(d); + newStride[d] = self->stride(d); } } - THFree(self->size); - THFree(self->stride); - - self->size = newSize; - self->stride = newStride; - self->dim_++; + THTensor_setSizesAndStrides(self, std::move(newSize), std::move(newStride)); } /* we have to handle the case where the result is a number */ @@ -515,12 +493,12 @@ void THTensor_(squeeze)(THTensor *self, THTensor *src) for(d = 0; d < src->dim(); d++) { - if(src->size[d] != 1) + if(src->size(d) != 1) { if(d != ndim) { - self->size[ndim] = src->size[d]; - self->stride[ndim] = src->stride[d]; + THTensor_setSizeAtDim(self, ndim, 
src->size(d)); + THTensor_setStrideAtDim(self, ndim, src->stride(d)); } ndim++; } @@ -530,12 +508,12 @@ void THTensor_(squeeze)(THTensor *self, THTensor *src) /* right now, we do not handle 0-dimension tensors */ if(ndim == 0 && src->dim() > 0) { - self->size[0] = 1; - self->stride[0] = 1; + THTensor_setSizeAtDim(self, 0, 1); + THTensor_setStrideAtDim(self, 0, 1); ndim = 1; } #endif - self->dim_ = ndim; + THTensor_resizeDim(self, ndim); } void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension) @@ -550,17 +528,17 @@ void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension) THTensor_(set)(self, src); #ifdef USE_TH_SCALAR - if(src->size[dimension] == 1) + if(src->size(dimension) == 1) #else - if(src->size[dimension] == 1 && src->dim() > 1) + if(src->size(dimension) == 1 && src->dim() > 1) #endif { for(d = dimension; d < self->dim()-1; d++) { - self->size[d] = self->size[d+1]; - self->stride[d] = self->stride[d+1]; + THTensor_setSizeAtDim(self, d, self->size(d+1)); + THTensor_setStrideAtDim(self, d, self->stride(d+1)); } - self->dim_--; + THTensor_resizeDim(self, self->dim_ - 1); } } @@ -578,19 +556,17 @@ void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension) THTensor_(set)(self, src); - self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*(self->dim()+1)); - self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*(self->dim()+1)); - self->dim_++; + THTensor_resizeDim(self, self->dim() + 1); for (d = self->dim()-1; d > dimension; d--) { - self->size[d] = self->size[d-1]; - self->stride[d] = self->stride[d-1]; + THTensor_setSizeAtDim(self, d, self->size(d-1)); + THTensor_setStrideAtDim(self, d, self->stride(d-1)); } if (dimension+1 < self->dim()) { - self->stride[dimension] = self->size[dimension+1] * self->stride[dimension+1]; + THTensor_setStrideAtDim(self, dimension, self->size(dimension+1) * self->stride(dimension+1)); } else { - self->stride[dimension] = 1; + THTensor_setStrideAtDim(self, dimension, 1); } - self->size[dimension] = 1; + THTensor_setSizeAtDim(self, dimension, 1); } int THTensor_(isTransposed)(const THTensor *self) @@ -603,13 +579,13 @@ int THTensor_(isTransposed)(const THTensor *self) int64_t z = 1; int d; for (d = 0; d < self->_dim(); ++d) { - if (self->stride[d] == 0 && self->size[d] != 1) + if (self->stride(d) == 0 && self->size(d) != 1) return 0; - if (self->stride[d] > max_stride) { - max_stride = self->stride[d]; - size_max_stride = self->size[d]; + if (self->stride(d) > max_stride) { + max_stride = self->stride(d); + size_max_stride = self->size(d); } - z *= self->size[d]; + z *= self->size(d); } if (z == max_stride * size_max_stride) { return 1; @@ -624,10 +600,10 @@ int THTensor_(isContiguous)(const THTensor *self) int d; for(d = self->dim()-1; d >= 0; d--) { - if(self->size[d] != 1) + if(self->size(d) != 1) { - if(self->stride[d] == z) - z *= self->size[d]; + if(self->stride(d) == z) + z *= self->size(d); else return 0; } @@ -643,7 +619,7 @@ int THTensor_(isSize)(const THTensor *self, const THLongStorage *dims) for(d = 0; d < self->_dim(); ++d) { - if(self->size[d] != THLongStorage_data(dims)[d]) + if(self->size(d) != THLongStorage_data(dims)[d]) return 0; } return 1; @@ -656,7 +632,7 @@ int THTensor_(isSameSizeAs)(const THTensor *self, const THTensor* src) return 0; for(d = 0; d < self->dim(); ++d) { - if(self->size[d] != src->size[d]) + if(self->size(d) != src->size(d)) return 0; } return 1; @@ -673,7 +649,7 @@ int THTensor_(isSetTo)(const THTensor *self, const THTensor* src) int d; for (d = 0; d < 
self->_dim(); ++d) { - if (self->size[d] != src->size[d] || self->stride[d] != src->stride[d]) + if (self->size(d) != src->size(d) || self->stride(d) != src->stride(d)) return 0; } return 1; } @@ -690,15 +666,14 @@ ptrdiff_t THTensor_(nElement)(const THTensor *self) ptrdiff_t nElement = 1; int d; for(d = 0; d < self->_dim(); d++) - nElement *= self->size[d]; + nElement *= self->size(d); return nElement; } } void THTensor_(retain)(THTensor *self) { - if(self->flag & TH_TENSOR_REFCOUNTED) - ++self->refcount; + ++self->refcount; } void THTensor_(free)(THTensor *self) @@ -716,19 +691,6 @@ void THTensor_(freeCopyTo)(THTensor *self, THTensor *dst) /*******************************************************************************/ -static void THTensor_(rawInit)(THTensor *self) -{ - new (&self->refcount) std::atomic<int>(1); - self->storage = THStorage_(new)(); - self->storageOffset = 0; - self->size = static_cast<int64_t *>(THAlloc(sizeof(int64_t))); - self->stride = static_cast<int64_t *>(THAlloc(sizeof(int64_t))); - self->size[0] = 0; - self->stride[0] = 1; - self->dim_ = 1; - self->flag = TH_TENSOR_REFCOUNTED; -} - void THTensor_(setStorageNd)(THTensor *self, THStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) { /* storage */ @@ -778,12 +740,12 @@ void THTensor_(resizeNd)(THTensor *self, int nDimension, int64_t *size, int64_t AT_CHECK(size[d] > 0, "sizes must be non-negative"); } #endif - if((self->dim() > d) && (size[d] != self->size[d])) { + if((self->dim() > d) && (size[d] != self->size(d))) { hascorrectsize = false; } // NB: this used to test that stride[d] was >= 0 - if((self->dim() > d) && stride && (stride[d] != self->stride[d])) { + if((self->dim() > d) && stride && (stride[d] != self->stride(d))) { hascorrectsize = false; } } @@ -798,26 +760,24 @@ void THTensor_(resizeNd)(THTensor *self, int nDimension, int64_t *size, int64_t if(nDimension != self->dim()) { - self->size = (int64_t *)THRealloc(self->size, sizeof(int64_t)*nDimension); - self->stride = (int64_t *)THRealloc(self->stride, sizeof(int64_t)*nDimension); - self->dim_ = nDimension; + THTensor_resizeDim(self, nDimension); } totalSize = 1; for(d = nDimension-1; d >= 0; d--) { - self->size[d] = size[d]; + THTensor_setSizeAtDim(self, d, size[d]); if(stride && (stride[d] >= 0) ) { - self->stride[d] = stride[d]; + THTensor_setStrideAtDim(self, d, stride[d]); } else { if(d == nDimension-1) { - self->stride[d] = 1; + THTensor_setStrideAtDim(self, d, 1); } else { // Keep stride monotonically increasing to match NumPy.
- self->stride[d] = std::max<int64_t>(self->size[d+1], 1)*self->stride[d+1]; + THTensor_setStrideAtDim(self, d, std::max<int64_t>(self->size(d+1), 1)*self->stride(d+1)); } } - totalSize += (self->size[d]-1)*self->stride[d]; + totalSize += (self->size(d)-1)*self->stride(d); } if(totalSize+self->storageOffset > 0) @@ -834,57 +794,57 @@ void THTensor_(resizeNd)(THTensor *self, int nDimension, int64_t *size, int64_t void THTensor_(set1d)(THTensor *tensor, int64_t x0, real value) { THArgCheck(tensor->_dim() == 1, 1, "tensor must have one dimension"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); - THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0], value); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); + THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0), value); } real THTensor_(get1d)(const THTensor *tensor, int64_t x0) { THArgCheck(tensor->_dim() == 1, 1, "tensor must have one dimension"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); - return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); + return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)); } void THTensor_(set2d)(THTensor *tensor, int64_t x0, int64_t x1, real value) { THArgCheck(tensor->_dim() == 2, 1, "tensor must have two dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); - THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1], value); + THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range"); + THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1), value); } real THTensor_(get2d)(const THTensor *tensor, int64_t x0, int64_t x1) { THArgCheck(tensor->_dim() == 2, 1, "tensor must have two dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); - return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]); + THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range"); + return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)); } void THTensor_(set3d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, real value) { THArgCheck(tensor->_dim() == 3, 1, "tensor must have three dimensions"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); - THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2], value); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range"); + THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2), value); } real THTensor_(get3d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2) { THArgCheck(tensor->_dim() == 3, 1, "tensor must have three dimensions"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); - return THStorage_(get)(tensor->storage, 
tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range"); + return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)); } void THTensor_(set4d)(THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value) { THArgCheck(tensor->_dim() == 4, 1, "tensor must have four dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); - THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3], value); + THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range"); + THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3), value); } real THTensor_(get4d)(const THTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3) { THArgCheck(tensor->_dim() == 4, 1, "tensor must have four dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); - return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3]); + THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range"); + return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3)); } THDescBuff THTensor_(desc)(const THTensor *tensor) { @@ -898,7 +858,7 @@ THDescBuff THTensor_(desc)(const THTensor *tensor) { int i; for(i = 0; i < tensor->_dim(); i++) { if(n >= L) break; - n += snprintf(str+n, L-n, "%" PRId64, tensor->size[i]); + n += snprintf(str+n, L-n, "%" PRId64, tensor->size(i)); if(i < tensor->_dim()-1) { n += snprintf(str+n, L-n, "x"); } diff --git a/aten/src/TH/generic/THTensor.h b/aten/src/TH/generic/THTensor.h index 03cc0169677fb9..cdc8f7edef41ce 100644 --- a/aten/src/TH/generic/THTensor.h +++ b/aten/src/TH/generic/THTensor.h @@ -4,7 +4,7 @@ /* a la lua? dim, storageoffset, ... et les methodes ? 
*/ -#define TH_TENSOR_REFCOUNTED 1 +#define THCTensor THTensor // Struct definition moved to THTensor.hpp typedef struct THTensor THTensor; @@ -33,9 +33,6 @@ TH_API THLongStorage *THTensor_(newSizeOf)(THTensor *self); TH_API THLongStorage *THTensor_(newStrideOf)(THTensor *self); TH_API real *THTensor_(data)(const THTensor *self); -TH_API void THTensor_(setFlag)(THTensor *self, const char flag); -TH_API void THTensor_(clearFlag)(THTensor *self, const char flag); - /**** creation methods ****/ TH_API THTensor *THTensor_(new)(void); diff --git a/aten/src/TH/generic/THTensorConv.cpp b/aten/src/TH/generic/THTensorConv.cpp index fb4670cf0f7903..0c590d6f9e400e 100644 --- a/aten/src/TH/generic/THTensorConv.cpp +++ b/aten/src/TH/generic/THTensorConv.cpp @@ -600,15 +600,15 @@ void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - nInputPlane = input->size[0]; - istride0 = input->stride[0]; - nInputRows = input->size[1]; - nInputCols = input->size[2]; + nInputPlane = input->size(0); + istride0 = input->stride(0); + nInputRows = input->size(1); + nInputCols = input->size(2); - kstride0 = kernel->stride[0]; - nKernelPlane = kernel->size[0]; - nKernelRows = kernel->size[1]; - nKernelCols = kernel->size[2]; + kstride0 = kernel->stride(0); + nKernelPlane = kernel->size(0); + nKernelRows = kernel->size(1); + nKernelCols = kernel->size(2); THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "covn2DRevger : Input image is smaller than kernel"); @@ -627,7 +627,7 @@ void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, /*THTensor_(zero)(r_);*/ #pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]*r_->size[1]; k++) + for (k = 0; k < r_->size(0)*r_->size(1); k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; int64_t l; @@ -639,7 +639,7 @@ void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, { /*THTensor_(mul)(r_, beta);*/ #pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]*r_->size[1]; k++) + for (k = 0; k < r_->size(0)*r_->size(1); k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; int64_t l; @@ -706,21 +706,21 @@ void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - istride0 = input->stride[0]; - istride1 = input->stride[1]; - nbatch = input->size[0]; - nInputPlane = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; + istride0 = input->stride(0); + istride1 = input->stride(1); + nbatch = input->size(0); + nInputPlane = input->size(1); + nInputRows = input->size(2); + nInputCols = input->size(3); - kstride0 = kernel->stride[0]; - kstride1 = kernel->stride[1]; - nKernelPlane = kernel->size[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; + kstride0 = kernel->stride(0); + kstride1 = kernel->stride(1); + nKernelPlane = kernel->size(1); + nKernelRows = kernel->size(2); + nKernelCols = kernel->size(3); THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "conv2DRevger : Input image is smaller than kernel"); - THArgCheck(kernel->size[0] == input->size[0] , 2, "conv2DRevger : Input batch and kernel batch is not same size"); + THArgCheck(kernel->size(0) == input->size(0) , 2, "conv2DRevger : Input batch and kernel batch is not same size"); nOutputRows = nInputRows - (nKernelRows - 1) * srow; nOutputCols = nInputCols - 
(nKernelCols - 1) * scol; @@ -737,7 +737,7 @@ void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, /*THTensor_(zero)(r_);*/ #pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]*r_->size[1]; k++) + for (k = 0; k < r_->size(0)*r_->size(1); k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; int64_t l; @@ -749,7 +749,7 @@ void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, { /*THTensor_(mul)(r_, beta);*/ #pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]*r_->size[1]; k++) + for (k = 0; k < r_->size(0)*r_->size(1); k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; int64_t l; @@ -820,15 +820,15 @@ void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THT input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - nInputPlane = input->size[0]; - istride0 = input->stride[0]; - nInputRows = input->size[1]; - nInputCols = input->size[2]; + nInputPlane = input->size(0); + istride0 = input->stride(0); + nInputRows = input->size(1); + nInputCols = input->size(2); - kstride0 = kernel->stride[0]; - nKernelPlane = kernel->size[0]; - nKernelRows = kernel->size[1]; - nKernelCols = kernel->size[2]; + kstride0 = kernel->stride(0); + nKernelPlane = kernel->size(0); + nKernelRows = kernel->size(1); + nKernelCols = kernel->size(2); THArgCheck((nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dger : Input image is smaller than kernel"); @@ -851,7 +851,7 @@ void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THT { /*THTensor_(zero)(r_);*/ #pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]*r_->size[1]; k++) + for (k = 0; k < r_->size(0)*r_->size(1); k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; int64_t l; @@ -863,7 +863,7 @@ void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THT { /*THTensor_(mul)(r_, beta);*/ #pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]*r_->size[1]; k++) + for (k = 0; k < r_->size(0)*r_->size(1); k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; int64_t l; @@ -949,24 +949,24 @@ void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'"); input = THTensor_(newContiguous)(t_); - if (!(k_->stride[3] == 1) || !(k_->stride[2] == k_->size[3])) { + if (!(k_->stride(3) == 1) || !(k_->stride(2) == k_->size(3))) { kernel = THTensor_(newContiguous)(k_); } else { THTensor_(retain)(k_); kernel = k_; } - nInputPlane = input->size[0]; - istride0 = input->stride[0]; - nInputRows = input->size[1]; - nInputCols = input->size[2]; + nInputPlane = input->size(0); + istride0 = input->stride(0); + nInputRows = input->size(1); + nInputCols = input->size(2); - kstride0 = kernel->stride[0]; - kstride1 = kernel->stride[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; - nOutputPlane = kernel->size[0]; - THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes"); + kstride0 = kernel->stride(0); + kstride1 = kernel->stride(1); + nKernelRows = kernel->size(2); + nKernelCols = kernel->size(3); + nOutputPlane = kernel->size(0); + THArgCheck(kernel->size(1) == nInputPlane, 2, "invalid number of input planes"); THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmv : Input image is smaller than kernel"); @@ -989,7 +989,7 @@ void 
THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe { /*THTensor_(zero)(r_);*/ #pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]; k++) + for (k = 0; k < r_->size(0); k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; int64_t l; @@ -1001,7 +1001,7 @@ void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe { /*THTensor_(mul)(r_, beta);*/ #pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]; k++) + for (k = 0; k < r_->size(0); k++) { real* ptr_output = output_data + k*nOutputCols*nOutputRows; int64_t l; @@ -1087,24 +1087,24 @@ void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'"); input = THTensor_(newContiguous)(t_); - if (!(k_->stride[3] == 1) || !(k_->stride[2] == k_->size[3])) { + if (!(k_->stride(3) == 1) || !(k_->stride(2) == k_->size(3))) { kernel = THTensor_(newContiguous)(k_); } else { THTensor_(retain)(k_); kernel = k_; } - nbatch = input->size[0]; - nInputPlane = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; + nbatch = input->size(0); + nInputPlane = input->size(1); + nInputRows = input->size(2); + nInputCols = input->size(3); - kstride0 = kernel->stride[0]; - kstride1 = kernel->stride[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; - nOutputPlane = kernel->size[0]; - THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes"); + kstride0 = kernel->stride(0); + kstride1 = kernel->stride(1); + nKernelRows = kernel->size(2); + nKernelCols = kernel->size(3); + nOutputPlane = kernel->size(0); + THArgCheck(kernel->size(1) == nInputPlane, 2, "invalid number of input planes"); THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmv : Input image is smaller than kernel"); @@ -1127,10 +1127,10 @@ void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe { /*THTensor_(zero)(r_);*/ #pragma omp parallel for private(p) - for (p=0; p < r_->size[0]; p++) + for (p=0; p < r_->size(0); p++) { int64_t k; - for (k = 0; k < r_->size[1]; k++) + for (k = 0; k < r_->size(1); k++) { real* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows; int64_t l; @@ -1143,10 +1143,10 @@ void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe { /*THTensor_(mul)(r_, beta);*/ #pragma omp parallel for private(p) - for(p=0; p < r_->size[0]; p++) + for(p=0; p < r_->size(0); p++) { int64_t k; - for (k = 0; k < r_->size[1]; k++) + for (k = 0; k < r_->size(1); k++) { real* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows; int64_t l; @@ -1236,10 +1236,10 @@ void THTensor_(conv2Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THT input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - nInputRows = input->size[0]; - nInputCols = input->size[1]; - nKernelRows = kernel->size[0]; - nKernelCols = kernel->size[1]; + nInputRows = input->size(0); + nInputCols = input->size(1); + nKernelRows = kernel->size(0); + nKernelCols = kernel->size(1); THArgCheck((nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmul : Input image is smaller than kernel"); @@ -1295,15 +1295,15 @@ void THTensor_(conv2Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, TH input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - istride0 
= input->stride[0]; - nInputPlane = input->size[0]; - nInputRows = input->size[1]; - nInputCols = input->size[2]; + istride0 = input->stride(0); + nInputPlane = input->size(0); + nInputRows = input->size(1); + nInputCols = input->size(2); - kstride0 = kernel->stride[0]; - nOutputPlane = kernel->size[0]; - nKernelRows = kernel->size[1]; - nKernelCols = kernel->size[2]; + kstride0 = kernel->stride(0); + nOutputPlane = kernel->size(0); + nKernelRows = kernel->size(1); + nKernelCols = kernel->size(2); THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dcmul : Input image is smaller than kernel"); @@ -1374,15 +1374,15 @@ void THTensor_(conv2Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THT input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - istride0 = input->stride[0]; - nInputPlane = input->size[0]; - nInputRows = input->size[1]; - nInputCols = input->size[2]; + istride0 = input->stride(0); + nInputPlane = input->size(0); + nInputRows = input->size(1); + nInputCols = input->size(2); - kstride0 = kernel->stride[0]; - nOutputPlane = kernel->size[0]; - nKernelRows = kernel->size[1]; - nKernelCols = kernel->size[2]; + kstride0 = kernel->stride(0); + nOutputPlane = kernel->size(0); + nKernelRows = kernel->size(1); + nKernelCols = kernel->size(2); THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) @@ -1405,7 +1405,7 @@ void THTensor_(conv2Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THT weight_data = THTensor_(data)(kernel); output_data = THTensor_(data)(r_); - nmaps = map->size[0]; + nmaps = map->size(0); for(k = 0; k < nmaps; k++) { @@ -1462,17 +1462,17 @@ void THTensor_(conv3DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - nInputPlane = input->size[0]; - istride0 = input->stride[0]; - nInputDepth = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; + nInputPlane = input->size(0); + istride0 = input->stride(0); + nInputDepth = input->size(1); + nInputRows = input->size(2); + nInputCols = input->size(3); - kstride0 = kernel->stride[0]; - nKernelPlane = kernel->size[0]; - nKernelDepth= kernel->size[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; + kstride0 = kernel->stride(0); + nKernelPlane = kernel->size(0); + nKernelDepth= kernel->size(1); + nKernelRows = kernel->size(2); + nKernelCols = kernel->size(3); THArgCheck(nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "conv3DRevger : Input image is smaller than kernel"); @@ -1550,17 +1550,17 @@ void THTensor_(conv3Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THT input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - nInputPlane = input->size[0]; - istride0 = input->stride[0]; - nInputDepth = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; + nInputPlane = input->size(0); + istride0 = input->stride(0); + nInputDepth = input->size(1); + nInputRows = input->size(2); + nInputCols = input->size(3); - kstride0 = kernel->stride[0]; - nKernelPlane = kernel->size[0]; - nKernelDepth = kernel->size[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; + kstride0 = kernel->stride(0); + nKernelPlane = kernel->size(0); + nKernelDepth 
= kernel->size(1); + nKernelRows = kernel->size(2); + nKernelCols = kernel->size(3); THArgCheck((nInputDepth >= nKernelDepth && nInputRows >= nKernelRows @@ -1639,26 +1639,26 @@ void THTensor_(conv3Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'"); input = THTensor_(newContiguous)(t_); - if (!(k_->stride[4] == 1) || !(k_->stride[3] == k_->size[4])) { + if (!(k_->stride(4) == 1) || !(k_->stride(3) == k_->size(4))) { kernel = THTensor_(newContiguous)(k_); } else { THTensor_(retain)(k_); kernel = k_; } - nInputPlane = input->size[0]; - istride0 = input->stride[0]; - nInputDepth = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; + nInputPlane = input->size(0); + istride0 = input->stride(0); + nInputDepth = input->size(1); + nInputRows = input->size(2); + nInputCols = input->size(3); - kstride0 = kernel->stride[0]; - kstride1 = kernel->stride[1]; - nKernelDepth = kernel->size[2]; - nKernelRows = kernel->size[3]; - nKernelCols = kernel->size[4]; - nOutputPlane = kernel->size[0]; - THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes"); + kstride0 = kernel->stride(0); + kstride1 = kernel->stride(1); + nKernelDepth = kernel->size(2); + nKernelRows = kernel->size(3); + nKernelCols = kernel->size(4); + nOutputPlane = kernel->size(0); + THArgCheck(kernel->size(1) == nInputPlane, 2, "invalid number of input planes"); THArgCheck( (nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dmv : Input image is smaller than kernel"); @@ -1736,12 +1736,12 @@ void THTensor_(conv3Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THT input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - nInputDepth = input->size[0]; - nInputRows = input->size[1]; - nInputCols = input->size[2]; - nKernelDepth = kernel->size[0]; - nKernelRows = kernel->size[1]; - nKernelCols = kernel->size[2]; + nInputDepth = input->size(0); + nInputRows = input->size(1); + nInputCols = input->size(2); + nKernelDepth = kernel->size(0); + nKernelRows = kernel->size(1); + nKernelCols = kernel->size(2); THArgCheck((nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dmul : Input image is smaller than kernel"); @@ -1802,17 +1802,17 @@ void THTensor_(conv3Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, TH input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - istride0 = input->stride[0]; - nInputPlane = input->size[0]; - nInputDepth = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; + istride0 = input->stride(0); + nInputPlane = input->size(0); + nInputDepth = input->size(1); + nInputRows = input->size(2); + nInputCols = input->size(3); - kstride0 = kernel->stride[0]; - nOutputPlane = kernel->size[0]; - nKernelDepth = kernel->size[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; + kstride0 = kernel->stride(0); + nOutputPlane = kernel->size(0); + nKernelDepth = kernel->size(1); + nKernelRows = kernel->size(2); + nKernelCols = kernel->size(3); THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); THArgCheck( (nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dcmul : Input image is smaller than kernel"); @@ -1889,17 +1889,17 @@ void THTensor_(conv3Dmap)(THTensor *r_, real beta, real alpha, THTensor 
*t_, THT input = THTensor_(newContiguous)(t_); kernel = THTensor_(newContiguous)(k_); - istride0 = input->stride[0]; - nInputPlane = input->size[0]; - nInputDepth = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; + istride0 = input->stride(0); + nInputPlane = input->size(0); + nInputDepth = input->size(1); + nInputRows = input->size(2); + nInputCols = input->size(3); - kstride0 = kernel->stride[0]; - nOutputPlane = kernel->size[0]; - nKernelDepth = kernel->size[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; + kstride0 = kernel->stride(0); + nOutputPlane = kernel->size(0); + nKernelDepth = kernel->size(1); + nKernelRows = kernel->size(2); + nKernelCols = kernel->size(3); THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); THArgCheck((nInputDepth >= nKernelDepth @@ -1925,7 +1925,7 @@ void THTensor_(conv3Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THT weight_data = THTensor_(data)(kernel); output_data = THTensor_(data)(r_); - nmaps = map->size[0]; + nmaps = map->size(0); for(k = 0; k < nmaps; k++) { diff --git a/aten/src/TH/generic/THTensorFastGetSet.hpp b/aten/src/TH/generic/THTensorFastGetSet.hpp index de65f083ea38f3..fa989ddafaf403 100644 --- a/aten/src/TH/generic/THTensorFastGetSet.hpp +++ b/aten/src/TH/generic/THTensorFastGetSet.hpp @@ -3,43 +3,43 @@ #else static inline real THTensor_(fastGet1d)(THTensor *self, int64_t x0) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]]; + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)]; } static inline real THTensor_(fastGet2d)(THTensor *self, int64_t x0, int64_t x1) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]]; + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)]; } static inline real THTensor_(fastGet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]]; + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)]; } static inline real THTensor_(fastGet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]+(x3)*self->stride[3]]; + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)]; } static inline real THTensor_(fastGet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4) { - return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]+(x3)*self->stride[3]+(x4)*self->stride[4]]; + return (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)+(x4)*self->stride(4)]; } static inline void THTensor_(fastSet1d)(THTensor *self, int64_t x0, real value) { - (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]] = value; + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)] = value; } static inline void THTensor_(fastSet2d)(THTensor *self, int64_t x0, int64_t x1, real value) { - 
(THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]] = value; + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)] = value; } static inline void THTensor_(fastSet3d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, real value) { - (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]] = value; + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)] = value; } static inline void THTensor_(fastSet4d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value) { - (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]+(x3)*self->stride[3]] = value; + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)] = value; } static inline void THTensor_(fastSet5d)(THTensor *self, int64_t x0, int64_t x1, int64_t x2, int64_t x3, int64_t x4, real value) { - (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride[0]+(x1)*self->stride[1]+(x2)*self->stride[2]+(x3)*self->stride[3]+(x4)*self->stride[4]] = value; + (THStorage_(data)(self->storage)+self->storageOffset)[(x0)*self->stride(0)+(x1)*self->stride(1)+(x2)*self->stride(2)+(x3)*self->stride(3)+(x4)*self->stride(4)] = value; } #endif diff --git a/aten/src/TH/generic/THTensorLapack.cpp b/aten/src/TH/generic/THTensorLapack.cpp index ca562e3021aec2..4793dec43de2af 100644 --- a/aten/src/TH/generic/THTensorLapack.cpp +++ b/aten/src/TH/generic/THTensorLapack.cpp @@ -7,7 +7,7 @@ Check if self is transpose of a contiguous matrix */ static int THTensor_(isTransposedContiguous)(THTensor *self) { - return self->stride[0] == 1 && self->stride[1] == self->size[0]; + return self->stride(0) == 1 && self->stride(1) == self->size(0); } /* If a matrix is a regular contiguous matrix, make sure it is transposed @@ -53,7 +53,7 @@ input space, like underdetermined gels. static THTensor *THTensor_(checkLapackClone)(THTensor *result, THTensor *src, int nrows) { /* check if user wants to reuse src and if it is correct shape/size */ - if (src == result && THTensor_(isTransposedContiguous)(src) && src->size[1] == nrows) + if (src == result && THTensor_(isTransposedContiguous)(src) && src->size(1) == nrows) THTensor_(retain)(result); else if(src == result || result == NULL) /* in this case, user wants reuse of src, but its structure is not OK */ result = THTensor_(new)(); @@ -77,14 +77,14 @@ static THTensor *THTensor_(cloneColumnMajorNrows)(THTensor *self, THTensor *src, if (src == result) return result; - THTensor_(resize2d)(result, src->size[1], nrows); + THTensor_(resize2d)(result, src->size(1), nrows); THTensor_(checkTransposed)(result); - if (src->size[0] == nrows) + if (src->size(0) == nrows) THTensor_(copy)(result, src); else { - view = THTensor_(newNarrow)(result, 0, 0, src->size[0]); + view = THTensor_(newNarrow)(result, 0, 0, src->size(0)); THTensor_(copy)(view, src); THTensor_(free)(view); } @@ -98,7 +98,7 @@ freed by calling function. 
*/ static THTensor *THTensor_(cloneColumnMajor)(THTensor *self, THTensor *src) { - return THTensor_(cloneColumnMajorNrows)(self, src, src->size[0]); + return THTensor_(cloneColumnMajorNrows)(self, src, src->size(0)); } void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) @@ -106,18 +106,20 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) int free_b = 0; if (a == NULL) a = ra_; if (b == NULL) b = rb_; - THArgCheck(a->_dim() == 2, 2, "A should have 2 dimensions, but has %d", - a->_dim()); - THArgCheck(b->_dim() == 1 || b->_dim() == 2, 1, "B should have 1 or 2 " - "dimensions, but has %d", b->_dim()); - THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld", - a->size[0], a->size[1]); - THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " - "rows, B has %ld", a->size[0], b->size[0]); - - if (b->_dim() == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], - b->stride[0], 1, 0); + THArgCheck(a->dim() == 2, 2, "A should have 2 dimensions, but has %d", + a->dim()); + THArgCheck(!a->is_empty(), 2, "A should not be empty"); + THArgCheck(b->dim() == 1 || b->dim() == 2, 1, "B should have 1 or 2 " + "dimensions, but has %d", b->dim()); + THArgCheck(!b->is_empty(), 2, "B should not be empty"); + THArgCheck(a->size(0) == a->size(1), 2, "A should be square, but is %ldx%ld", + a->size(0), a->size(1)); + THArgCheck(a->size(0) == b->size(0), 2, "A,B size incompatible - A has %ld " + "rows, B has %ld", a->size(0), b->size(0)); + + if (b->dim() == 1) { + b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size(0), + b->stride(0), 1, 0); free_b = 1; } @@ -129,8 +131,8 @@ void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) ra__ = THTensor_(cloneColumnMajor)(ra_, a); rb__ = THTensor_(cloneColumnMajor)(rb_, b); - n = (int)ra__->size[0]; - nrhs = (int)rb__->size[1]; + n = (int)ra__->size(0); + nrhs = (int)rb__->size(1); lda = n; ldb = n; @@ -163,14 +165,14 @@ void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a, a->_dim()); THArgCheck(b->_dim() == 1 || b->_dim() == 2, 1, "B should have 1 or 2 " "dimensions, but has %d", b->_dim()); - THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld", - a->size[0], a->size[1]); - THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " - "rows, B has %ld", a->size[0], b->size[0]); + THArgCheck(a->size(0) == a->size(1), 2, "A should be square, but is %ldx%ld", + a->size(0), a->size(1)); + THArgCheck(a->size(0) == b->size(0), 2, "A,B size incompatible - A has %ld " + "rows, B has %ld", a->size(0), b->size(0)); if (b->_dim() == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], - b->stride[0], 1, 0); + b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size(0), + b->stride(0), 1, 0); free_b = 1; } @@ -181,8 +183,8 @@ void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a, ra__ = THTensor_(cloneColumnMajor)(ra_, a); rb__ = THTensor_(cloneColumnMajor)(rb_, b); - n = (int)ra__->size[0]; - nrhs = (int)rb__->size[1]; + n = (int)ra__->size(0); + nrhs = (int)rb__->size(1); lda = n; ldb = n; @@ -209,16 +211,18 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) // Note that a = NULL is interpreted as a = ra_, and b = NULL as b = rb_. 
if (a == NULL) a = ra_; if (b == NULL) b = rb_; - THArgCheck(a->_dim() == 2, 2, "A should have 2 dimensions, but has %d", - a->_dim()); - THArgCheck(b->_dim() == 1 || b->_dim() == 2, 1, "B should have 1 or 2 " - "dimensions, but has %d", b->_dim()); - THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " - "rows, B has %ld", a->size[0], b->size[0]); + THArgCheck(a->dim() == 2, 2, "A should have 2 dimensions, but has %d", + a->dim()); + THArgCheck(!a->is_empty(), 2, "A should not be empty"); + THArgCheck(b->dim() == 1 || b->dim() == 2, 1, "B should have 1 or 2 " + "dimensions, but has %d", b->dim()); + THArgCheck(!b->is_empty(), 1, "B should not be empty"); + THArgCheck(a->size(0) == b->size(0), 2, "A,B size incompatible - A has %ld " + "rows, B has %ld", a->size(0), b->size(0)); if (b->_dim() == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], - b->stride[0], 1, 0); + b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size(0), + b->stride(0), 1, 0); free_b = 1; } @@ -231,14 +235,14 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) ra__ = THTensor_(cloneColumnMajor)(ra_, a); - m = ra__->size[0]; - n = ra__->size[1]; + m = ra__->size(0); + n = ra__->size(1); lda = m; ldb = (m > n) ? m : n; rb__ = THTensor_(cloneColumnMajorNrows)(rb_, b, ldb); - nrhs = rb__->size[1]; + nrhs = rb__->size(1); info = 0; @@ -277,7 +281,7 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *jobvr) { int n, lda, lwork, info, ldvr; - THTensor *work, *wi, *wr, *a; + THTensor *work=nullptr, *wi, *wr, *a; real wkopt; real *rv_data; int64_t i; @@ -285,13 +289,13 @@ void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *job THTensor *re__ = NULL; THTensor *rv__ = NULL; - THArgCheck(a_->_dim() == 2, 1, "A should be 2 dimensional"); - THArgCheck(a_->size[0] == a_->size[1], 1,"A should be square"); + THArgCheck(a_->dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a_->size(0) == a_->size(1), 1,"A should be square"); /* we want to definitely clone a_ for geev*/ a = THTensor_(cloneColumnMajor)(NULL, a_); - n = a->size[0]; + n = a->size(0); lda = n; wi = THTensor_(newWithSize1d)(n); @@ -310,24 +314,26 @@ void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *job THTensor_(resize2d)(re_,n,2); re__ = THTensor_(newContiguous)(re_); - /* get optimal workspace size */ - THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, THTensor_(data)(wr), THTensor_(data)(wi), - NULL, 1, rv_data, ldvr, &wkopt, -1, &info); - - lwork = (int)wkopt; - work = THTensor_(newWithSize1d)(lwork); - - THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, THTensor_(data)(wr), THTensor_(data)(wi), - NULL, 1, rv_data, ldvr, THTensor_(data)(work), lwork, &info); - - THLapackCheckWithCleanup(" Lapack Error in %s : %d off-diagonal elements of an didn't converge to zero", - THCleanup(THTensor_(free)(re__); - THTensor_(free)(rv__); - THTensor_(free)(a); - THTensor_(free)(wi); - THTensor_(free)(wr); - THTensor_(free)(work);), - "geev", info,""); + if (n > 0) { // lapack doesn't work with size 0 + /* get optimal workspace size */ + THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, THTensor_(data)(wr), THTensor_(data)(wi), + NULL, 1, rv_data, ldvr, &wkopt, -1, &info); + + lwork = (int)wkopt; + work = THTensor_(newWithSize1d)(lwork); + + THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, 
THTensor_(data)(wr), THTensor_(data)(wi), + NULL, 1, rv_data, ldvr, THTensor_(data)(work), lwork, &info); + + THLapackCheckWithCleanup(" Lapack Error in %s : %d off-diagonal elements of an didn't converge to zero", + THCleanup(THTensor_(free)(re__); + THTensor_(free)(rv__); + THTensor_(free)(a); + THTensor_(free)(wi); + THTensor_(free)(wr); + THTensor_(free)(work);), + "geev", info,""); + } { real *re_data = THTensor_(data)(re__); @@ -355,11 +361,11 @@ void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *job void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a, const char *jobz, const char *uplo) { if (a == NULL) a = rv_; - THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1,"A should be square"); + THArgCheck(a->dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->size(0) == a->size(1), 1,"A should be square"); int n, lda, lwork, info; - THTensor *work; + THTensor *work = nullptr; real wkopt; THTensor *rv__ = NULL; @@ -367,25 +373,27 @@ void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a, const char *jobz rv__ = THTensor_(cloneColumnMajor)(rv_, a); - n = rv__->size[0]; + n = rv__->size(0); lda = n; THTensor_(resize1d)(re_,n); re__ = THTensor_(newContiguous)(re_); /* get optimal workspace size */ - THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda, - THTensor_(data)(re_), &wkopt, -1, &info); - lwork = (int)wkopt; - work = THTensor_(newWithSize1d)(lwork); - THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda, - THTensor_(data)(re_), THTensor_(data)(work), lwork, &info); - - THLapackCheckWithCleanup("Lapack Error %s : %d off-diagonal elements didn't converge to zero", - THCleanup(THTensor_(free)(rv__); - THTensor_(free)(re__); - THTensor_(free)(work);), - "syev", info,""); + if (n != 0) { + THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda, + THTensor_(data)(re_), &wkopt, -1, &info); + lwork = (int)wkopt; + work = THTensor_(newWithSize1d)(lwork); + THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda, + THTensor_(data)(re_), THTensor_(data)(work), lwork, &info); + + THLapackCheckWithCleanup("Lapack Error %s : %d off-diagonal elements didn't converge to zero", + THCleanup(THTensor_(free)(rv__); + THTensor_(free)(re__); + THTensor_(free)(work);), + "syev", info,""); + } // No eigenvectors specified if (*jobz == 'N') { @@ -407,7 +415,8 @@ void THTensor_(gesvd)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *a, void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra_, THTensor *a, const char* jobu) { if (a == NULL) a = ra_; - THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(!a->is_empty(), 1, "A should not be empty"); int k,m, n, lda, ldu, ldvt, lwork, info; THTensor *work; @@ -421,8 +430,8 @@ void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra ra__ = THTensor_(cloneColumnMajor)(ra_, a); - m = ra__->size[0]; - n = ra__->size[1]; + m = ra__->size(0); + n = ra__->size(1); k = (m < n ? 
m : n); lda = m; @@ -490,7 +499,7 @@ void THTensor_(getri)(THTensor *ra_, THTensor *a) { if (a == NULL) a = ra_; THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 1, "A should be square"); int m, n, lda, info, lwork; real wkopt; @@ -500,8 +509,8 @@ void THTensor_(getri)(THTensor *ra_, THTensor *a) ra__ = THTensor_(cloneColumnMajor)(ra_, a); - m = ra__->size[0]; - n = ra__->size[1]; + m = ra__->size(0); + n = ra__->size(1); lda = m; ipiv = THIntTensor_newWithSize1d((int64_t)m); @@ -533,9 +542,9 @@ void THTensor_(getri)(THTensor *ra_, THTensor *a) void THTensor_(clearUpLoTriangle)(THTensor *a, const char *uplo) { THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 1, "A should be square"); - int n = a->size[0]; + int n = a->size(0); /* Build full matrix */ real *p = THTensor_(data)(a); @@ -566,9 +575,9 @@ void THTensor_(clearUpLoTriangle)(THTensor *a, const char *uplo) void THTensor_(copyUpLoTriangle)(THTensor *a, const char *uplo) { THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 1, "A should be square"); - int n = a->size[0]; + int n = a->size(0); /* Build full matrix */ real *p = THTensor_(data)(a); @@ -600,14 +609,14 @@ void THTensor_(potrf)(THTensor *ra_, THTensor *a, const char *uplo) { if (a == NULL) a = ra_; THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 1, "A should be square"); int n, lda, info; THTensor *ra__ = NULL; ra__ = THTensor_(cloneColumnMajor)(ra_, a); - n = ra__->size[0]; + n = ra__->size(0); lda = n; /* Run Factorization */ @@ -629,14 +638,14 @@ void THTensor_(potrs)(THTensor *rb_, THTensor *b, THTensor *a, const char *uplo) a->_dim()); THArgCheck(b->_dim() == 1 || b->_dim() == 2, 1, "B should have 1 or 2 " "dimensions, but has %d", b->_dim()); - THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld", - a->size[0], a->size[1]); - THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " - "rows, B has %ld", a->size[0], b->size[0]); + THArgCheck(a->size(0) == a->size(1), 2, "A should be square, but is %ldx%ld", + a->size(0), a->size(1)); + THArgCheck(a->size(0) == b->size(0), 2, "A,B size incompatible - A has %ld " + "rows, B has %ld", a->size(0), b->size(0)); if (b->_dim() == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], - b->stride[0], 1, 0); + b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size(0), + b->stride(0), 1, 0); free_b = 1; } @@ -647,8 +656,8 @@ void THTensor_(potrs)(THTensor *rb_, THTensor *b, THTensor *a, const char *uplo) ra__ = THTensor_(cloneColumnMajor)(NULL, a); rb__ = THTensor_(cloneColumnMajor)(rb_, b); - n = (int)ra__->size[0]; - nrhs = (int)rb__->size[1]; + n = (int)ra__->size(0); + nrhs = (int)rb__->size(1); lda = n; ldb = n; @@ -672,14 +681,14 @@ void THTensor_(potri)(THTensor *ra_, THTensor *a, const char *uplo) { if (a == NULL) a = ra_; THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 1, "A should be square"); int n, lda, info; THTensor *ra__ = NULL; ra__ = THTensor_(cloneColumnMajor)(ra_, 
a); - n = ra__->size[0]; + n = ra__->size(0); lda = n; /* Run inverse */ @@ -710,9 +719,9 @@ void THTensor_(potri)(THTensor *ra_, THTensor *a, const char *uplo) */ void THTensor_(pstrf)(THTensor *ra_, THIntTensor *rpiv_, THTensor *a, const char *uplo, real tol) { THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 1, "A should be square"); - int n = a->size[0]; + int n = a->size(0); THTensor *ra__ = THTensor_(cloneColumnMajor)(ra_, a); THIntTensor_resize1d(rpiv_, n); @@ -757,17 +766,17 @@ void THTensor_(pstrf)(THTensor *ra_, THIntTensor *rpiv_, THTensor *a, const char */ void THTensor_(qr)(THTensor *rq_, THTensor *rr_, THTensor *a) { - int m = a->size[0]; - int n = a->size[1]; + int m = a->size(0); + int n = a->size(1); int k = (m < n ? m : n); THTensor *ra_ = THTensor_(new)(); THTensor *rtau_ = THTensor_(new)(); THTensor *rr__ = THTensor_(new)(); THTensor_(geqrf)(ra_, rtau_, a); - THTensor_(resize2d)(rr__, k, ra_->size[1]); + THTensor_(resize2d)(rr__, k, ra_->size(1)); THTensor_(narrow)(rr__, ra_, 0, 0, k); THTensor_(triu)(rr_, rr__, 0); - THTensor_(resize2d)(rq_, ra_->size[0], k); + THTensor_(resize2d)(rq_, ra_->size(0), k); THTensor_(orgqr)(rq_, ra_, rtau_); THTensor_(narrow)(rq_, rq_, 1, 0, k); THTensor_(free)(ra_); @@ -795,15 +804,16 @@ void THTensor_(qr)(THTensor *rq_, THTensor *rr_, THTensor *a) void THTensor_(geqrf)(THTensor *ra_, THTensor *rtau_, THTensor *a) { if (a == NULL) ra_ = a; - THArgCheck(a->_dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(a->dim() == 2, 1, "A should be 2 dimensional"); + THArgCheck(!a->is_empty(), 1, "A should not be empty"); THTensor *ra__ = NULL; /* Prepare the input for LAPACK, making a copy if necessary. */ ra__ = THTensor_(cloneColumnMajor)(ra_, a); - int m = ra__->size[0]; - int n = ra__->size[1]; + int m = ra__->size(0); + int n = ra__->size(1); int k = (m < n ? m : n); int lda = m; THTensor_(resize1d)(rtau_, k); @@ -856,8 +866,8 @@ void THTensor_(orgqr)(THTensor *ra_, THTensor *a, THTensor *tau) THTensor *ra__ = NULL; ra__ = THTensor_(cloneColumnMajor)(ra_, a); - int m = ra__->size[0]; - int k = tau->size[0]; + int m = ra__->size(0); + int k = tau->size(0); int lda = m; /* Dry-run to query the suggested size of the workspace. 
*/ @@ -909,9 +919,9 @@ void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, co THTensor *ra__ = NULL; ra__ = THTensor_(cloneColumnMajor)(ra_, c); - int m = c->size[0]; - int n = c->size[1]; - int k = tau->size[0]; + int m = c->size(0); + int n = c->size(1); + int k = tau->size(0); int lda; if (*side == 'L') { @@ -948,7 +958,7 @@ void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, co void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinfo_, int pivot, THTensor *a) { - AT_CHECK(!a->is_empty() && THTensor_(nDimension)(a) == 3, "expected 3D tensor, got size: ", a->sizes()); + AT_CHECK(THTensor_(nDimension)(a) == 3, "expected 3D tensor, got size: ", a->sizes()); if (!pivot) { THError("btrifact without pivoting is not implemented on the CPU"); } @@ -958,8 +968,8 @@ void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinf THTensor_(copy)(ra_, a); } - int m = a->size[1]; - int n = a->size[2]; + int m = a->size(1); + int n = a->size(2); if (m != n) { THError("btrifact is only implemented for square matrices"); } @@ -967,9 +977,9 @@ void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinf THTensor *ra__; int lda; - if (ra_->stride[1] == 1) { + if (ra_->stride(1) == 1) { // column ordered, what BLAS wants - lda = ra_->stride[2]; + lda = ra_->stride(2); ra__ = ra_; } else { // not column ordered, need to make it such (requires copy) @@ -977,7 +987,7 @@ void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinf ra__ = THTensor_(newClone)(transp_r_); THTensor_(free)(transp_r_); THTensor_(transpose)(ra__, NULL, 1, 2); - lda = ra__->stride[2]; + lda = ra__->stride(2); } THTensor *ai = THTensor_(new)(); @@ -1039,18 +1049,18 @@ void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor THTensor_(copy)(rb_, b); } - int64_t num_batches = atf->size[0]; - int64_t n = atf->size[1]; - int nrhs = rb_->_dim() > 2 ? rb_->size[2] : 1; + int64_t num_batches = atf->size(0); + int64_t n = atf->size(1); + int nrhs = rb_->_dim() > 2 ? 
rb_->size(2) : 1; int lda, ldb; THTensor *atf_; THTensor *rb__; // correct ordering of A - if (atf->stride[1] == 1) { + if (atf->stride(1) == 1) { // column ordered, what BLAS wants - lda = atf->stride[2]; + lda = atf->stride(2); atf_ = atf; } else { // not column ordered, need to make it such (requires copy) @@ -1061,16 +1071,16 @@ void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor atf_ = THTensor_(newClone)(transp_r_); THTensor_(free)(transp_r_); THTensor_(transpose)(atf_, NULL, 1, 2); - lda = atf_->stride[2]; + lda = atf_->stride(2); } // correct ordering of B - if (rb_->stride[1] == 1) { + if (rb_->stride(1) == 1) { // column ordered - if (rb_->_dim() == 2 || rb_->size[2] == 1) { + if (rb_->_dim() == 2 || rb_->size(2) == 1) { ldb = n; } else { - ldb = rb_->stride[2]; + ldb = rb_->stride(2); } rb__ = rb_; } else { @@ -1080,7 +1090,7 @@ void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor rb__ = THTensor_(newClone)(transp_r_); THTensor_(free)(transp_r_); THTensor_(transpose)(rb__, NULL, 1, 2); - ldb = rb__->stride[2]; + ldb = rb__->stride(2); } else { rb__ = THTensor_(newClone)(rb_); ldb = n; diff --git a/aten/src/TH/generic/THTensorMath.cpp b/aten/src/TH/generic/THTensorMath.cpp index 4fa0003984b5f8..e4152432a3068a 100644 --- a/aten/src/TH/generic/THTensorMath.cpp +++ b/aten/src/TH/generic/THTensorMath.cpp @@ -109,10 +109,7 @@ #define TH_CHECK_SAME_SIZE(TENSOR1, TENSOR2) \ { \ if(!THTensor_(isSameSizeAs)(TENSOR1, TENSOR2)) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->_dim()); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->_dim()); \ - THError("inconsistent tensor size, expected %s %s and %s %s to have the same size", \ - #TENSOR1, T1buff.str, #TENSOR2, T2buff.str); \ + AT_ERROR("inconsistent tensor size, expected ", #TENSOR1, " ", TENSOR1->sizes(), " and ", #TENSOR2, " ", TENSOR2->sizes(), " to have the same size"); \ } \ } @@ -121,31 +118,27 @@ // TENSOR2 is src // TENSOR3 is index // Tests: -// 1. index->size[d] <= src->size[d] for all d -// 2. index->size[d] <= real->size[d] for all d != dim +// 1. index->size(d) <= src->size(d) for all d +// 2. 
index->size(d) <= real->size(d) for all d != dim #define TH_TENSOR_DIM_APPLY3_SIZE_SCATTER(TENSOR1, TENSOR2, TENSOR3, DIMENSION) \ { \ int shape_check_flag = 0; \ for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->_dim(); TH_TENSOR_DIM_APPLY_i++) \ { \ - int64_t TENSOR3##_dim_size = TENSOR3->size[TH_TENSOR_DIM_APPLY_i]; \ + int64_t TENSOR3##_dim_size = TENSOR3->size(TH_TENSOR_DIM_APPLY_i); \ if (TH_TENSOR_DIM_APPLY_i != DIMENSION) { \ - if (TENSOR3##_dim_size > TENSOR1->size[TH_TENSOR_DIM_APPLY_i]) { \ + if (TENSOR3##_dim_size > TENSOR1->size(TH_TENSOR_DIM_APPLY_i)) { \ shape_check_flag = 1; \ break; \ } \ } \ - if (TENSOR3##_dim_size > TENSOR2->size[TH_TENSOR_DIM_APPLY_i]) { \ + if (TENSOR3##_dim_size > TENSOR2->size(TH_TENSOR_DIM_APPLY_i)) { \ shape_check_flag = 1; \ break; \ } \ } \ if (shape_check_flag == 1) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->_dim()); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->_dim()); \ - THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->_dim()); \ - THError("Expected %s %s to be smaller size than %s %s and to be smaller than %s %s apart from dimension %d", \ - #TENSOR3, T3buff.str, #TENSOR2, T2buff.str, #TENSOR1, T1buff.str, DIMENSION); \ + AT_ERROR("Expected ", #TENSOR3, " ", TENSOR3->sizes(), " to be smaller size than ", #TENSOR2, " ", TENSOR2->sizes(), " and to be smaller than ", #TENSOR1, " ", TENSOR1->sizes(), " apart from dimension ", DIMENSION); \ } \ } @@ -297,8 +290,8 @@ void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor) div = 1; for (dim = tensor->dim() - 1; dim >= 0; dim--) { - *(subscript_data + dim) = (i/div) % tensor->size[dim]; - div *= tensor->size[dim]; + *(subscript_data + dim) = (i/div) % tensor->size(dim); + div *= tensor->size(dim); } subscript_data += tensor->dim(); @@ -314,14 +307,20 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens int64_t *index_data; real *tensor_data, *src_data; +#ifndef USE_TH_SIZE_ZERO_DIM THArgCheck(index->_dim() <= 1, 3, "Index is supposed to be an empty tensor or a vector"); THArgCheck(dim < src->_dim(), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); THArgCheck(src->_dim() > 0, 2, "Source tensor is empty"); +#else + THArgCheck(index->dim() == 1, 3, "Index is supposed to be 1-dimensional"); + THArgCheck(dim < src->dim(), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); + //THArgCheck(src->dim() > 0, 2, "Source tensor is empty"); +#endif numel = THLongTensor_nElement(index); newSize = THLongStorage_newWithSize(src->dim()); - THLongStorage_rawCopy(newSize,src->size); + THLongStorage_rawCopy(newSize, THTensor_getSizePtr(src)); #ifdef DEBUG THAssert(numel <= LONG_MAX); #endif @@ -336,10 +335,10 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens { tensor_data = THTensor_(data)(tensor); src_data = THTensor_(data)(src); - ptrdiff_t rowsize = THTensor_(nElement)(src) / src->size[0]; + ptrdiff_t rowsize = src->size(0) == 0 ? 
1: THTensor_(nElement)(src) / src->size(0); // check that the indices are within range - int64_t max = src->size[0] - 1 + TH_INDEX_BASE; + int64_t max = src->size(0) - 1 + TH_INDEX_BASE; for (i=0; i max) { THLongTensor_free(index); @@ -347,7 +346,7 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens } } - if (src->_dim() == 1) { + if (src->dim() == 1) { #pragma omp parallel for if(numel > TH_OMP_OVERHEAD_THRESHOLD) private(i) for (i=0; i_dim() == 1) + else if (src->dim() == 1) { for (i=0; i_dim() > 1 ) + if (tensor->dim() > 1 ) { tSlice = THTensor_(new)(); sSlice = THTensor_(new)(); @@ -418,8 +417,8 @@ void THTensor_(indexCopy)(THTensor *tensor, int dim, THLongTensor *index, THTens } static ptrdiff_t THTensor_(dataOffset)(THTensor* tensor, ptrdiff_t linearIndex) { - int64_t *size = tensor->size; - int64_t *stride = tensor->stride; + auto size = tensor->sizes(); + auto stride = tensor->strides(); int nDim = tensor->_dim(); ptrdiff_t dataOffset = 0; for (int i = nDim - 1; i >= 0; i--) { @@ -439,7 +438,7 @@ static inline int64_t THTensor_(wrapLinearIndex)(int64_t linearIndex, int64_t nu void THTensor_(take)(THTensor *r_, THTensor *src, THLongTensor *index) { - THTensor_(resizeNd)(r_, index->dim(), index->size, NULL); + THTensor_(resizeNd)(r_, index->dim(), THTensor_getSizePtr(index), NULL); THTensor* dst = THTensor_(newContiguous)(r_); index = THLongTensor_newContiguous(index); @@ -513,14 +512,19 @@ void THTensor_(indexAdd)(THTensor *tensor, int dim, THLongTensor *index, THTenso int64_t *index_data; numel = THLongTensor_nElement(index); +#ifndef USE_TH_SIZE_ZERO_DIM THArgCheck(index->_dim() == 1, 3, "Index is supposed to be a vector"); THArgCheck(dim < src->_dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); - THArgCheck(numel == src->size[dim],4,"Number of indices should be equal to source:size(dim)"); +#else + THArgCheck(index->dim() == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < src->dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#endif + THArgCheck(numel == src->size(dim),4,"Number of indices should be equal to source:size(dim)"); index = THLongTensor_newContiguous(index); index_data = THLongTensor_data(index); - if (tensor->_dim() > 1) + if (tensor->dim() > 1) { tSlice = THTensor_(new)(); sSlice = THTensor_(new)(); @@ -554,15 +558,20 @@ void THTensor_(indexFill)(THTensor *tensor, int dim, THLongTensor *index, real v int64_t *index_data; numel = THLongTensor_nElement(index); +#ifndef USE_TH_SIZE_ZERO_DIM THArgCheck(index->_dim() == 1, 3, "Index is supposed to be a vector"); THArgCheck(dim < tensor->_dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#else + THArgCheck(index->dim() == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < tensor->dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#endif index = THLongTensor_newContiguous(index); index_data = THLongTensor_data(index); for (i=0; i_dim() > 1) + if (tensor->dim() > 1) { tSlice = THTensor_(new)(); THTensor_(select)(tSlice, tensor,dim,index_data[i] - TH_INDEX_BASE); @@ -581,11 +590,11 @@ void THTensor_(gather)(THTensor *tensor, THTensor *src, int dim, THLongTensor *i { int64_t elems_per_row, i, idx; - THArgCheck(THLongTensor__nDimension(index) == THTensor_(_nDimension)(src), 4, + THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(src), 4, "Index tensor must have same dimensions as input tensor"); - THArgCheck(dim >= 0 && dim < 
THTensor_(_nDimension)(tensor), 3, + THArgCheck(dim >= 0 && dim < THTensor_(nDimension)(tensor), 3, "Index dimension is out of bounds"); - THArgCheck(THTensor_(_nDimension)(src) == THTensor_(_nDimension)(tensor), 2, + THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 2, "Input tensor must have same dimensions as output tensor"); elems_per_row = THLongTensor_size(index, dim); @@ -608,11 +617,19 @@ void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor { int64_t elems_per_row, i, idx; +#ifndef USE_TH_SIZE_ZERO_DIM THArgCheck(dim < THTensor_(_nDimension)(tensor), 2, "Index dimension is out of bounds"); THArgCheck(THLongTensor__nDimension(index) == THTensor_(_nDimension)(tensor), 3, "Index tensor must have same dimensions as output tensor"); THArgCheck(THTensor_(_nDimension)(src) == THTensor_(_nDimension)(tensor), 4, "Input tensor must have same dimensions as output tensor"); +#else + THArgCheck(dim < THTensor_(nDimension)(tensor), 2, "Index dimension is out of bounds"); + THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(tensor), 3, + "Index tensor must have same dimensions as output tensor"); + THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 4, + "Input tensor must have same dimensions as output tensor"); +#endif elems_per_row = THLongTensor_size(index, dim); @@ -634,10 +651,10 @@ void THTensor_(scatterAdd)(THTensor *tensor, int dim, THLongTensor *index, THTen { int64_t elems_per_row, i, idx; - THArgCheck(dim < THTensor_(_nDimension)(tensor), 2, "Index dimension is out of bounds"); - THArgCheck(THLongTensor__nDimension(index) == THTensor_(_nDimension)(tensor), 3, + THArgCheck(dim < THTensor_(nDimension)(tensor), 2, "Index dimension is out of bounds"); + THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(tensor), 3, "Index tensor must have same dimensions as output tensor"); - THArgCheck(THTensor_(_nDimension)(src) == THTensor_(_nDimension)(tensor), 4, + THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 4, "Input tensor must have same dimensions as output tensor"); elems_per_row = THLongTensor_size(index, dim); @@ -1931,20 +1948,20 @@ void THTensor_(addcdiv)(THTensor *r_, THTensor *t, real value, THTensor *src1, T void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *mat, THTensor *vec) { - if( (mat->_dim() != 2) || (vec->_dim() != 1) ) + if( (mat->dim() != 2) || (vec->dim() != 1) ) THError("matrix and vector expected, got %dD, %dD", - mat->_dim(), vec->_dim()); + mat->dim(), vec->dim()); - if( mat->size[1] != vec->size[0] ) { + if( mat->size(1) != vec->size(0) ) { THDescBuff bm = THTensor_(sizeDesc)(mat); THDescBuff bv = THTensor_(sizeDesc)(vec); THError("size mismatch, %s, %s", bm.str, bv.str); } - if(t->_dim() != 1) - THError("vector expected, got t: %dD", t->_dim()); + if(t->dim() != 1) + THError("vector expected, got t: %dD", t->dim()); - if(t->size[0] != mat->size[0]) { + if(t->size(0) != mat->size(0)) { THDescBuff bt = THTensor_(sizeDesc)(t); THDescBuff bm = THTensor_(sizeDesc)(mat); THError("size mismatch, t: %s, mat: %s", bt.str, bm.str); @@ -1959,28 +1976,28 @@ void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor // n == 1 || lda >= max(1, m) #define LDA_COND(M, N, LDA) ((N) == 1 || (LDA) >= THMax(1, (M))) - if(mat->stride[0] == 1 && LDA_COND(mat->size[0], mat->size[1], mat->stride[1])) + if(mat->stride(0) == 1 && LDA_COND(mat->size(0), mat->size(1), mat->stride(1))) { - THBlas_(gemv)('n', mat->size[0], 
mat->size[1], - alpha, THTensor_(data)(mat), mat->stride[1], - THTensor_(data)(vec), vec->stride[0], - beta, THTensor_(data)(r_), r_->stride[0]); + THBlas_(gemv)('n', mat->size(0), mat->size(1), + alpha, THTensor_(data)(mat), mat->stride(1), + THTensor_(data)(vec), vec->stride(0), + beta, THTensor_(data)(r_), r_->stride(0)); } - else if(mat->stride[1] == 1 && LDA_COND(mat->size[1], mat->size[0], mat->stride[0])) + else if(mat->stride(1) == 1 && LDA_COND(mat->size(1), mat->size(0), mat->stride(0))) { - THBlas_(gemv)('t', mat->size[1], mat->size[0], - alpha, THTensor_(data)(mat), mat->stride[0], - THTensor_(data)(vec), vec->stride[0], - beta, THTensor_(data)(r_), r_->stride[0]); + THBlas_(gemv)('t', mat->size(1), mat->size(0), + alpha, THTensor_(data)(mat), mat->stride(0), + THTensor_(data)(vec), vec->stride(0), + beta, THTensor_(data)(r_), r_->stride(0)); } else { THTensor *cmat = THTensor_(newContiguous)(mat); - THBlas_(gemv)('t', mat->size[1], mat->size[0], - alpha, THTensor_(data)(cmat), cmat->stride[0], - THTensor_(data)(vec), vec->stride[0], - beta, THTensor_(data)(r_), r_->stride[0]); + THBlas_(gemv)('t', mat->size(1), mat->size(0), + alpha, THTensor_(data)(cmat), cmat->stride(0), + THTensor_(data)(vec), vec->stride(0), + beta, THTensor_(data)(r_), r_->stride(0)); THTensor_(free)(cmat); } @@ -1990,8 +2007,8 @@ void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor void THTensor_(match)(THTensor *r_, THTensor *m1, THTensor *m2, real gain) { - int64_t N1 = m1->size[0]; - int64_t N2 = m2->size[0]; + int64_t N1 = m1->size(0); + int64_t N2 = m2->size(0); int64_t dim; real *m1_p; real *m2_p; @@ -2006,8 +2023,8 @@ void THTensor_(match)(THTensor *r_, THTensor *m1, THTensor *m2, real gain) THTensor_(resize2d)(m1, N1, THTensor_(nElement)(m1) / N1); THTensor_(resize2d)(m2, N2, THTensor_(nElement)(m2) / N2); - dim = m1->size[1]; - THArgCheck(m1->size[1] == m2->size[1], 3, "m1 and m2 must have the same inner vector dim"); + dim = m1->size(1); + THArgCheck(m1->size(1) == m2->size(1), 3, "m1 and m2 must have the same inner vector dim"); m1_p = THTensor_(data)(m1); m2_p = THTensor_(data)(m2); @@ -2037,19 +2054,19 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor int free_m1 = 0; int free_m2 = 0; - if( (m1->_dim() != 2) || (m2->_dim() != 2)) - THError("matrices expected, got %dD, %dD tensors", m1->_dim(), m2->_dim()); + if( (m1->dim() != 2) || (m2->dim() != 2)) + THError("matrices expected, got %dD, %dD tensors", m1->dim(), m2->dim()); - if(m1->size[1] != m2->size[0]) { + if(m1->size(1) != m2->size(0)) { THDescBuff bm1 = THTensor_(sizeDesc)(m1); THDescBuff bm2 = THTensor_(sizeDesc)(m2); THError("size mismatch, m1: %s, m2: %s", bm1.str, bm2.str); } - if( t->_dim() != 2 ) - THError("matrix expected, got %dD tensor for t", t->_dim()); + if( t->dim() != 2 ) + THError("matrix expected, got %dD tensor for t", t->dim()); - if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) ) { + if( (t->size(0) != m1->size(0)) || (t->size(1) != m2->size(1)) ) { THDescBuff bt = THTensor_(sizeDesc)(t); THDescBuff bm1 = THTensor_(sizeDesc)(m1); THDescBuff bm2 = THTensor_(sizeDesc)(m2); @@ -2068,14 +2085,14 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor #define LDC_COND(M, N, LDC) ((N) == 1 || (LDC) >= THMax(1, M)) /* r_ */ - if(r_->stride[0] == 1 && - LDC_COND(r_->size[0], r_->size[1], r_->stride[1])) + if(r_->stride(0) == 1 && + LDC_COND(r_->size(0), r_->size(1), r_->stride(1))) { transpose_r = 'n'; r__ = r_; } - else 
if(r_->stride[1] == 1 && - LDC_COND(r_->size[1], r_->size[0], r_->stride[0])) + else if(r_->stride(1) == 1 && + LDC_COND(r_->size(1), r_->size(0), r_->stride(0))) { THTensor *swap = m2; m2 = m1; @@ -2095,21 +2112,21 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor #undef LDC_COND - int64_t m = r__->size[(transpose_r == 'n' ? 0 : 1)]; - int64_t n = r__->size[(transpose_r == 'n' ? 1 : 0)]; - int64_t k = m1->size[(transpose_r == 'n' ? 1 : 0)]; - int64_t ldr__ = r__->stride[(transpose_r == 'n' ? 1 : 0)]; + int64_t m = r__->size((transpose_r == 'n' ? 0 : 1)); + int64_t n = r__->size((transpose_r == 'n' ? 1 : 0)); + int64_t k = m1->size((transpose_r == 'n' ? 1 : 0)); + int64_t ldr__ = r__->stride((transpose_r == 'n' ? 1 : 0)); /* m1 */ /* Need ldm1_ >= max(1, (transpose_m1 == 'n' ? m : k)) */ - if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && - m1->stride[(transpose_r == 'n' ? 1 : 0)] >= THMax(1, m)) + if(m1->stride((transpose_r == 'n' ? 0 : 1)) == 1 && + m1->stride((transpose_r == 'n' ? 1 : 0)) >= THMax(1, m)) { transpose_m1 = 'n'; m1_ = m1; } - else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && - m1->stride[(transpose_r == 'n' ? 0 : 1)] >= THMax(1, k)) + else if(m1->stride((transpose_r == 'n' ? 1 : 0)) == 1 && + m1->stride((transpose_r == 'n' ? 0 : 1)) >= THMax(1, k)) { transpose_m1 = 't'; m1_ = m1; @@ -2123,14 +2140,14 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor /* m2 */ /* Need ldm2_ >= max(1, (transpose_m2 == 'n' ? k : n)) */ - if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && - m2->stride[(transpose_r == 'n' ? 1 : 0)] >= THMax(1, k)) + if(m2->stride((transpose_r == 'n' ? 0 : 1)) == 1 && + m2->stride((transpose_r == 'n' ? 1 : 0)) >= THMax(1, k)) { transpose_m2 = 'n'; m2_ = m2; } - else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && - m2->stride[(transpose_r == 'n' ? 0 : 1)] >= THMax(1, n)) + else if(m2->stride((transpose_r == 'n' ? 1 : 0)) == 1 && + m2->stride((transpose_r == 'n' ? 0 : 1)) >= THMax(1, n)) { transpose_m2 = 't'; m2_ = m2; @@ -2142,8 +2159,8 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor free_m2 = 1; } - int64_t ldm1_ = (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]); - int64_t ldm2_ = (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]); + int64_t ldm1_ = (transpose_m1 == 'n' ? m1_->stride((transpose_r == 'n' ? 1 : 0)) : m1_->stride((transpose_r == 'n' ? 0 : 1))); + int64_t ldm2_ = (transpose_m2 == 'n' ? m2_->stride((transpose_r == 'n' ? 1 : 0)) : m2_->stride((transpose_r == 'n' ? 
0 : 1))); #pragma omp critical(blasgemm) /* do the operation */ @@ -2174,14 +2191,14 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *vec1, THTensor *vec2) { - if( (vec1->_dim() != 1) || (vec2->_dim() != 1) ) + if( (vec1->dim() != 1) || (vec2->dim() != 1) ) THError("vector and vector expected, got %dD, %dD tensors", - vec1->_dim(), vec2->_dim()); + vec1->dim(), vec2->dim()); - if(t->_dim() != 2) - THError("expected matrix, got %dD tensor for t", t->_dim()); + if(t->dim() != 2) + THError("expected matrix, got %dD tensor for t", t->dim()); - if( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) ) { + if( (t->size(0) != vec1->size(0)) || (t->size(1) != vec2->size(0)) ) { THDescBuff bt = THTensor_(sizeDesc)(t); THDescBuff bv1 = THTensor_(sizeDesc)(vec1); THDescBuff bv2 = THTensor_(sizeDesc)(vec2); @@ -2203,28 +2220,28 @@ void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor // n == 1 || lda >= max(1, m) #define LDA_COND(M, N, LDA) ((N) == 1 || (LDA) >= THMax(1, (M))) - if(r_->stride[0] == 1 && LDA_COND(vec1->size[0], vec2->size[0], r_->stride[1])) + if(r_->stride(0) == 1 && LDA_COND(vec1->size(0), vec2->size(0), r_->stride(1))) { - THBlas_(ger)(vec1->size[0], vec2->size[0], - alpha, THTensor_(data)(vec1), vec1->stride[0], - THTensor_(data)(vec2), vec2->stride[0], - THTensor_(data)(r_), r_->stride[1]); + THBlas_(ger)(vec1->size(0), vec2->size(0), + alpha, THTensor_(data)(vec1), vec1->stride(0), + THTensor_(data)(vec2), vec2->stride(0), + THTensor_(data)(r_), r_->stride(1)); } - else if(r_->stride[1] == 1 && LDA_COND(vec2->size[0], vec1->size[0], r_->stride[0])) + else if(r_->stride(1) == 1 && LDA_COND(vec2->size(0), vec1->size(0), r_->stride(0))) { - THBlas_(ger)(vec2->size[0], vec1->size[0], - alpha, THTensor_(data)(vec2), vec2->stride[0], - THTensor_(data)(vec1), vec1->stride[0], - THTensor_(data)(r_), r_->stride[0]); + THBlas_(ger)(vec2->size(0), vec1->size(0), + alpha, THTensor_(data)(vec2), vec2->stride(0), + THTensor_(data)(vec1), vec1->stride(0), + THTensor_(data)(r_), r_->stride(0)); } else { THTensor *cr = THTensor_(newClone)(r_); - THBlas_(ger)(vec2->size[0], vec1->size[0], - alpha, THTensor_(data)(vec2), vec2->stride[0], - THTensor_(data)(vec1), vec1->stride[0], - THTensor_(data)(cr), cr->stride[0]); + THBlas_(ger)(vec2->size(0), vec1->size(0), + alpha, THTensor_(data)(vec2), vec2->stride(0), + THTensor_(data)(vec1), vec1->stride(0), + THTensor_(data)(cr), cr->stride(0)); THTensor_(freeCopyTo)(cr, r_); } @@ -2236,8 +2253,8 @@ void THTensor_(addbmm)(THTensor *result, real beta, THTensor *t, real alpha, THT { int64_t batch; - THArgCheck(!batch1->is_empty() && THTensor_(nDimension)(batch1) == 3, 1, "expected non-empty 3D tensor"); - THArgCheck(!batch2->is_empty() && THTensor_(nDimension)(batch2) == 3, 2, "expected non-empty 3D tensor"); + THArgCheck(THTensor_(nDimension)(batch1) == 3, 1, "expected 3D tensor"); + THArgCheck(THTensor_(nDimension)(batch2) == 3, 2, "expected 3D tensor"); THArgCheck(THTensor_(size)(batch1, 0) == THTensor_(size)(batch2, 0), 2, "equal number of batches expected, got %d, %d", THTensor_(size)(batch1, 0), THTensor_(size)(batch2, 0)); @@ -2277,8 +2294,8 @@ void THTensor_(baddbmm)(THTensor *result, real beta, THTensor *t, real alpha, TH { int64_t batch; - THArgCheck(THTensor_(_nDimension)(batch1) == 3, 1, "expected 3D tensor, got %dD", THTensor_(_nDimension)(batch1)); - THArgCheck(THTensor_(_nDimension)(batch2) 
== 3, 2, "expected 3D tensor, got %dD", THTensor_(_nDimension)(batch2)); + THArgCheck(THTensor_(nDimension)(batch1) == 3, 1, "expected 3D tensor, got %dD", THTensor_(nDimension)(batch1)); + THArgCheck(THTensor_(nDimension)(batch2) == 3, 2, "expected 3D tensor, got %dD", THTensor_(nDimension)(batch2)); THArgCheck(THTensor_(size)(batch1, 0) == THTensor_(size)(batch2, 0), 2, "equal number of batches expected, got %d, %d", THTensor_(size)(batch1, 0), THTensor_(size)(batch2, 0)); @@ -2357,7 +2374,7 @@ void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int THLongStorage_free(dim); // two implementations optimized for data locality - if (t->stride[dimension] == 1) { + if (t->stride(dimension) == 1) { real theMax; real value; int64_t theIndex; @@ -2390,7 +2407,7 @@ void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int } THLongTensor_zero(indices_); - if(t->size[dimension] == 1) { + if(t->size(dimension) == 1) { if (!keepdim) { THTensor_(squeeze1d)(values_, values_, dimension); THLongTensor_squeeze1d(indices_, indices_, dimension); @@ -2400,13 +2417,13 @@ void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int THTensor *tempValues_ = THTensor_(newWithTensor)(values_); // tempValues_.expand_as(t) - tempValues_->size[dimension] = t->size[dimension]; - tempValues_->stride[dimension] = 0; + THTensor_setSizeAtDim(tempValues_, dimension, t->size(dimension)); + THTensor_setStrideAtDim(tempValues_, dimension, 0); THLongTensor *tempIndices_ = THLongTensor_newWithTensor(indices_); // tempIndices_.expand_as(t) - tempIndices_->size[dimension] = t->size[dimension]; - tempIndices_->stride[dimension] = 0; + THTensor_setSizeAtDim(tempIndices_, dimension, t->size(dimension)); + THTensor_setStrideAtDim(tempIndices_, dimension, 0); TH_TENSOR_APPLY3_D(real, t, real, tempValues_, int64_t, tempIndices_, dimension, if(!(*t_data <= *tempValues__data) && !th_isnan(*tempValues__data)) { @@ -2441,7 +2458,7 @@ void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int THLongStorage_free(dim); // two implementations optimized for data locality - if (t->stride[dimension] == 1) { + if (t->stride(dimension) == 1) { real theMax; real value; int64_t theIndex; @@ -2474,7 +2491,7 @@ void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int } THLongTensor_zero(indices_); - if(t->size[dimension] == 1) { + if(t->size(dimension) == 1) { if (!keepdim) { THTensor_(squeeze1d)(values_, values_, dimension); THLongTensor_squeeze1d(indices_, indices_, dimension); @@ -2484,13 +2501,13 @@ void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int THTensor *tempValues_ = THTensor_(newWithTensor)(values_); // tempValues_.expand_as(t) - tempValues_->size[dimension] = t->size[dimension]; - tempValues_->stride[dimension] = 0; + THTensor_setSizeAtDim(tempValues_, dimension, t->size(dimension)); + THTensor_setStrideAtDim(tempValues_, dimension, 0); THLongTensor *tempIndices_ = THLongTensor_newWithTensor(indices_); // tempIndices_.expand_as(t) - tempIndices_->size[dimension] = t->size[dimension]; - tempIndices_->stride[dimension] = 0; + THTensor_setSizeAtDim(tempIndices_, dimension, t->size(dimension)); + THTensor_setStrideAtDim(tempIndices_, dimension, 0); TH_TENSOR_APPLY3_D(real, t, real, tempValues_, int64_t, tempIndices_, dimension, if(!(*t_data >= *tempValues__data) && !th_isnan(*tempValues__data)) { @@ -2543,16 +2560,16 @@ void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension, int keepdim) for(j = 0; j < 
r_Dim; ++j) { if(j != dimension){ - quot = rem/r_->stride[j]; - rem = rem%r_->stride[j]; - tBasicIndex += quot*t->stride[j]; + quot = rem/r_->stride(j); + rem = rem%r_->stride(j); + tBasicIndex += quot*t->stride(j); } } real *t_data = tp+tBasicIndex; real *r__data = rp+iter; *r__data = 0; - for(j=0; j < t->size[dimension]; ++j) { - *r__data += *(t_data + j*t->stride[dimension]); + for(j=0; j < t->size(dimension); ++j) { + *r__data += *(t_data + j*t->stride(dimension)); } } } else { @@ -2564,7 +2581,7 @@ void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension, int keepdim) #endif if (serial_path) { // two implementations optimized for data locality - if (t->stride[dimension] == 1) { + if (t->stride(dimension) == 1) { TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, accreal sum = 0; int64_t i; @@ -2575,8 +2592,8 @@ void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension, int keepdim) THTensor_(zero)(r_); THTensor *temp_ = THTensor_(newWithTensor)(r_); // r_.expand_as(t) - temp_->size[dimension] = t->size[dimension]; - temp_->stride[dimension] = 0; + THTensor_setSizeAtDim(temp_, dimension, t->size(dimension)); + THTensor_setStrideAtDim(temp_, dimension, 0); TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data + *t_data;); THTensor_(free)(temp_); @@ -2623,16 +2640,16 @@ void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim) for(j = 0; j < r_Dim; ++j) { if(j != dimension){ - quot = rem/r_->stride[j]; - rem = rem%r_->stride[j]; - tBasicIndex += quot*t->stride[j]; + quot = rem/r_->stride(j); + rem = rem%r_->stride(j); + tBasicIndex += quot*t->stride(j); } } real *t_data = tp+tBasicIndex; real *r__data = rp+iter; *r__data = 1; - for(j=0; j < t->size[dimension]; ++j) { - *r__data *= *(t_data + j*t->stride[dimension]); + for(j=0; j < t->size(dimension); ++j) { + *r__data *= *(t_data + j*t->stride(dimension)); } } } else { @@ -2645,7 +2662,7 @@ void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim) if(serial_path) { // two implementations optimized for data locality - if (t->stride[dimension] == 1) { + if (t->stride(dimension) == 1) { TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, accreal prod = 1; int64_t i; @@ -2656,8 +2673,8 @@ void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim) THTensor_(fill)(r_, 1); THTensor *temp_ = THTensor_(newWithTensor)(r_); // r_.expand_as(t) - temp_->size[dimension] = t->size[dimension]; - temp_->stride[dimension] = 0; + THTensor_setSizeAtDim(temp_, dimension, t->size(dimension)); + THTensor_setStrideAtDim(temp_, dimension, 0); TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data * *t_data;); THTensor_(free)(temp_); @@ -2670,7 +2687,7 @@ void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim) void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension) { - THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "dimension %d out of range", + THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range", dimension + TH_INDEX_BASE); THTensor_(resizeAs)(r_, t); @@ -2687,7 +2704,7 @@ void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension) void THTensor_(cumprod)(THTensor *r_, THTensor *t, int dimension) { - THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "dimension %d out of range", + THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range", dimension + TH_INDEX_BASE); THTensor_(resizeAs)(r_, t); @@ -2745,11 +2762,11 @@ void 
THTensor_(cross)(THTensor *r_, THTensor *a, THTensor *b, int dimension) { int i; - if(THTensor_(_nDimension)(a) != THTensor_(_nDimension)(b)) + if(THTensor_(nDimension)(a) != THTensor_(nDimension)(b)) THError("inconsistent tensor dimension %dD, %dD", - THTensor_(_nDimension)(a), THTensor_(_nDimension)(b)); + THTensor_(nDimension)(a), THTensor_(nDimension)(b)); - for(i = 0; i < THTensor_(_nDimension)(a); i++) + for(i = 0; i < THTensor_(nDimension)(a); i++) { if(THTensor_(size)(a, i) != THTensor_(size)(b, i)) { THDescBuff ba = THTensor_(sizeDesc)(a); @@ -2760,7 +2777,7 @@ void THTensor_(cross)(THTensor *r_, THTensor *a, THTensor *b, int dimension) if(dimension < 0) { - for(i = 0; i < THTensor_(_nDimension)(a); i++) + for(i = 0; i < THTensor_(nDimension)(a); i++) { if(THTensor_(size)(a, i) == 3) { @@ -2774,7 +2791,7 @@ void THTensor_(cross)(THTensor *r_, THTensor *a, THTensor *b, int dimension) } } - THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(a), 3, "dimension %d out of range", + THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(a), 3, "dimension %d out of range", dimension + TH_INDEX_BASE); THArgCheck(THTensor_(size)(a, dimension) == 3, 3, "dimension %d does not have size 3", dimension + TH_INDEX_BASE); @@ -2893,7 +2910,7 @@ void THTensor_(eye)(THTensor *r_, int64_t n, int64_t m) r__data = THTensor_(data)(r_); sz = THMin(THTensor_(size)(r_, 0), THTensor_(size)(r_, 1)); for(i = 0; i < sz; i++) - r__data[i*(r_->stride[0]+r_->stride[1])] = 1; + r__data[i*(r_->stride(0)+r_->stride(1))] = 1; } @@ -3169,7 +3186,7 @@ static void THTensor_(quicksortdescend)(real *arr, int64_t *idx, int64_t element void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimension, int descendingOrder) { - THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 2, "invalid dimension %d", + THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "invalid dimension %d", dimension + TH_INDEX_BASE); THTensor_(resizeAs)(rt_, t); @@ -3376,7 +3393,7 @@ void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, int64_t t_size_dim; THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(t), 3, "dimension out of range"); - THArgCheck(k > 0 && k <= t->size[dimension], 2, "selected index out of range"); + THArgCheck(k > 0 && k <= t->size(dimension), 2, "selected index out of range"); int in_dims = THTensor_(_nDimension)(t); THTensor_(preserveReduceDimSemantics)(values_, in_dims, dimension, keepdim); @@ -3430,11 +3447,19 @@ void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, i void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted) { +#ifndef USE_TH_SIZE_ZERO_DIM int numDims = THTensor_(_nDimension)(t); +#else + int numDims = THTensor_(nDimension)(t); +#endif THArgCheck(dim >= 0 && dim < numDims, 3, "dim not in range"); int64_t sliceSize = THTensor_(size)(t, dim); +#ifndef USE_TH_SIZE_ZERO_DIM THArgCheck(k > 0 && k <= sliceSize, 2, "k not in range for dimension"); +#else + THArgCheck(k >= 0 && k <= sliceSize, 2, "k not in range for dimension"); +#endif THTensor *tmpResults = THTensor_(new)(); THTensor_(resize1d)(tmpResults, sliceSize); @@ -3577,8 +3602,8 @@ inline void THTensor_(check_shape_except_dim)(THTensor *first, THTensor *second, if (dim == dimension) { continue; } - int64_t first_dim_size = first->size[dim]; - int64_t second_dim_size = second->size[dim]; + int64_t first_dim_size = first->size(dim); + int64_t second_dim_size = 
second->size(dim); THArgCheck(first_dim_size == second_dim_size, 0, "Sizes of tensors must match except in dimension %d. Got %lld and %lld in dimension %d", dimension, (long long)first_dim_size, (long long)second_dim_size, dim); @@ -3622,13 +3647,13 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int continue; } THTensor_(check_shape_except_dim)(notSkippedTensor, tensor, dimension); - cat_dim_size += tensor->size[dimension]; + cat_dim_size += tensor->size(dimension); } // Compute the size of the result THLongStorage *size = THLongStorage_newWithSize(nDims); for (int dim = 0; dim < nDims; dim++) { - int64_t result_dim_size = notSkippedTensor->size[dim]; + int64_t result_dim_size = notSkippedTensor->size(dim); if (dim == dimension) { result_dim_size = cat_dim_size; } @@ -3667,7 +3692,7 @@ void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int offset = 0; for (int j = 0; j < numInputs; j++) { if (!should_skip(inputs[j])) { - int64_t dimSize = inputs[j]->size[dimension]; + int64_t dimSize = inputs[j]->size(dimension); THTensor *nt = THTensor_(newWithTensor)(result); THTensor_(narrow)(nt, NULL, dimension, offset, dimSize); THTensor_(copy)(nt, inputs[j]); @@ -3707,25 +3732,25 @@ int THTensor_(equal)(THTensor *ta, THTensor* tb) #define TENSOR_IMPLEMENT_LOGICAL(NAME,OP) \ void THTensor_(NAME##Value)(THByteTensor *r_, THTensor* t, real value) \ { \ - THByteTensor_resizeNd(r_, t->dim(), t->size, NULL); \ + THByteTensor_resizeNd(r_, t->dim(), THTensor_getSizePtr(t), NULL); \ TH_TENSOR_APPLY2(unsigned char, r_, real, t, \ *r__data = (*t_data OP value) ? 1 : 0;); \ } \ void THTensor_(NAME##ValueT)(THTensor* r_, THTensor* t, real value) \ { \ - THTensor_(resizeNd)(r_, t->dim(), t->size, NULL); \ + THTensor_(resizeNd)(r_, t->dim(), THTensor_getSizePtr(t), NULL); \ TH_TENSOR_APPLY2(real, r_, real, t, \ *r__data = (*t_data OP value) ? 1 : 0;); \ } \ void THTensor_(NAME##Tensor)(THByteTensor *r_, THTensor *ta, THTensor *tb) \ { \ - THByteTensor_resizeNd(r_, ta->dim(), ta->size, NULL); \ + THByteTensor_resizeNd(r_, ta->dim(), THTensor_getSizePtr(ta), NULL); \ TH_TENSOR_APPLY3(unsigned char, r_, real, ta, real, tb, \ *r__data = (*ta_data OP *tb_data) ? 1 : 0;); \ } \ void THTensor_(NAME##TensorT)(THTensor *r_, THTensor *ta, THTensor *tb) \ { \ - THTensor_(resizeNd)(r_, ta->dim(), ta->size, NULL); \ + THTensor_(resizeNd)(r_, ta->dim(), THTensor_getSizePtr(ta), NULL); \ TH_TENSOR_APPLY3(real, r_, real, ta, real, tb, \ *r__data = (*ta_data OP *tb_data) ? 
1 : 0;); \ } \ @@ -3926,16 +3951,16 @@ void THTensor_(logicalAnd)(THTensor *r_, THTensor *t, int dimension, int keepdim for(j = 0; j < r_Dim; ++j) { if(j != dimension){ - quot = rem/r_->stride[j]; - rem = rem%r_->stride[j]; - tBasicIndex += quot*t->stride[j]; + quot = rem/r_->stride(j); + rem = rem%r_->stride(j); + tBasicIndex += quot*t->stride(j); } } real *t_data = tp+tBasicIndex; real *r__data = rp+iter; *r__data = 1; - for(j=0; j < t->size[dimension]; ++j) { - *r__data = *r__data && *(t_data + j*t->stride[dimension]); + for(j=0; j < t->size(dimension); ++j) { + *r__data = *r__data && *(t_data + j*t->stride(dimension)); } } } else { @@ -3948,7 +3973,7 @@ void THTensor_(logicalAnd)(THTensor *r_, THTensor *t, int dimension, int keepdim if(serial_path) { // two implementations optimized for data locality - if (t->stride[dimension] == 1) { + if (t->stride(dimension) == 1) { TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, accreal prod = 1; int64_t i; @@ -3959,8 +3984,8 @@ void THTensor_(logicalAnd)(THTensor *r_, THTensor *t, int dimension, int keepdim THTensor_(fill)(r_, 1); THTensor *temp_ = THTensor_(newWithTensor)(r_); // r_.expand_as(t) - temp_->size[dimension] = t->size[dimension]; - temp_->stride[dimension] = 0; + THTensor_setSizeAtDim(temp_, dimension, t->size(dimension)); + THTensor_setStrideAtDim(temp_, dimension, 0); TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data && *t_data;); THTensor_(free)(temp_); @@ -4006,16 +4031,16 @@ void THTensor_(logicalAny)(THTensor *r_, THTensor *t, int dimension, int keepdim for(j = 0; j < r_Dim; ++j) { if(j != dimension){ - quot = rem/r_->stride[j]; - rem = rem%r_->stride[j]; - tBasicIndex += quot*t->stride[j]; + quot = rem/r_->stride(j); + rem = rem%r_->stride(j); + tBasicIndex += quot*t->stride(j); } } real *t_data = tp+tBasicIndex; real *r__data = rp+iter; *r__data = 0; - for(j=0; j < t->size[dimension]; ++j) { - *r__data = *r__data || *(t_data + j*t->stride[dimension]); + for(j=0; j < t->size(dimension); ++j) { + *r__data = *r__data || *(t_data + j*t->stride(dimension)); } } } else { @@ -4027,7 +4052,7 @@ void THTensor_(logicalAny)(THTensor *r_, THTensor *t, int dimension, int keepdim #endif if (serial_path) { // two implementations optimized for data locality - if (t->stride[dimension] == 1) { + if (t->stride(dimension) == 1) { TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, accreal sum = 0; int64_t i; @@ -4038,8 +4063,8 @@ void THTensor_(logicalAny)(THTensor *r_, THTensor *t, int dimension, int keepdim THTensor_(zero)(r_); THTensor *temp_ = THTensor_(newWithTensor)(r_); // r_.expand_as(t) - temp_->size[dimension] = t->size[dimension]; - temp_->stride[dimension] = 0; + THTensor_setSizeAtDim(temp_, dimension, t->size(dimension)); + THTensor_setStrideAtDim(temp_, dimension, 0); TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data || *t_data;); THTensor_(free)(temp_); @@ -4123,7 +4148,7 @@ void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension, int keepdim) dimension + TH_INDEX_BASE); THTensor_(sum)(r_, t, dimension, keepdim); - THTensor_(div)(r_, r_, t->size[dimension]); + THTensor_(div)(r_, r_, t->size(dimension)); } void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int biased, int keepdim) @@ -4288,21 +4313,20 @@ accreal THTensor_(normall)(THTensor *tensor, real value) void THTensor_(renorm)(THTensor *res, THTensor *src, real value, int dimension, real maxnorm) { - int i; THTensor *rowR, *rowS; - THArgCheck(dimension >= 0 && dimension < THTensor_(_nDimension)(src), 3, "invalid dimension %d", + 
THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(src), 3, "invalid dimension %d", dimension + TH_INDEX_BASE); THArgCheck(value > 0, 2, "non-positive-norm not supported"); - THArgCheck(THTensor_(_nDimension)(src) > 1, 1, "need at least 2 dimensions, got %d dimensions", - THTensor_(_nDimension)(src)); + THArgCheck(THTensor_(nDimension)(src) > 1, 1, "need at least 2 dimensions, got %d dimensions", + THTensor_(nDimension)(src)); rowR = THTensor_(new)(); rowS = THTensor_(new)(); THTensor_(resizeAs)(res, src); - for (i=0; isize[dimension]; i++) + for (int64_t i = 0; i < src->size(dimension); i++) { real norm = 0; real new_norm; @@ -4454,7 +4478,7 @@ void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, int64_t nbins, real min real minval; real maxval; - THTensor_(resize2d)(hist, tensor->size[0], nbins); + THTensor_(resize2d)(hist, tensor->size(0), nbins); THTensor_(zero)(hist); minval = minvalue; diff --git a/aten/src/TH/generic/THTensorRandom.cpp b/aten/src/TH/generic/THTensorRandom.cpp index 3ddbfa66b4c7b7..ceb927429573fc 100644 --- a/aten/src/TH/generic/THTensorRandom.cpp +++ b/aten/src/TH/generic/THTensorRandom.cpp @@ -79,7 +79,7 @@ void THTensor_(iBernoulli_generate_copy)(THTensor *self, THGenerator *_generator #endif } else { intTensor = THIntTensor_new(); - THIntTensor_resizeNd(intTensor, self->dim(), self->size, NULL); + THIntTensor_resizeNd(intTensor, self->dim(), THTensor_getSizePtr(self), nullptr); tmp = THIntTensor_data(intTensor); } @@ -284,9 +284,9 @@ void THTensor_(multinomialAliasSetup)(THTensor *probs, THLongTensor *J, THTensor small = THLongTensor_fastGet1d(smaller, small_c-1); THLongTensor_fastSet1d(J, small, large); - q_data[large * q->stride[0]] -= 1.0 - THTensor_(fastGet1d)(q, small); + q_data[large * q->stride(0)] -= 1.0 - THTensor_(fastGet1d)(q, small); - if(q_data[large * q->stride[0]] < 1.0) + if(q_data[large * q->stride(0)] < 1.0) { THLongTensor_fastSet1d(smaller, small_c-1, large); large_c -= 1; @@ -317,7 +317,7 @@ void THTensor_(multinomialAliasSetup)(THTensor *probs, THLongTensor *J, THTensor { for (i=0; i < inputsize; i++) { - q_data[i*q->stride[0]] /= q_max; + q_data[i*q->stride(0)] /= q_max; } } for (i=0; i < inputsize; i++) @@ -399,7 +399,7 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso { val = THStorage_(get)( \ prob_dist->storage, \ - prob_dist->storageOffset+i*prob_dist->stride[0]+j*prob_dist->stride[1] \ + prob_dist->storageOffset+i*prob_dist->stride(0)+j*prob_dist->stride(1) \ ); THArgCheckWithCleanup((val >= 0), THCleanup(THDoubleTensor_free(cum_dist); if (start_dim == 1) THTensor_(squeeze1d)(prob_dist, prob_dist, 0);), @@ -412,7 +412,7 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso sum += val; THDoubleStorage_set( cum_dist->storage, \ - cum_dist->storageOffset+j*cum_dist->stride[0], \ + cum_dist->storageOffset+j*cum_dist->stride(0), \ sum \ ); } @@ -426,7 +426,7 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso { for (j=0; jstride[0]] /= sum; + THDoubleTensor_data(cum_dist)[j*cum_dist->stride(0)] /= sum; } } @@ -442,14 +442,14 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso double cum_prob; int sample_idx; /* Make sure the last cumulative distribution bucket sums to 1 */ - THDoubleTensor_data(cum_dist)[(n_categories-1)*cum_dist->stride[0]] = 1; + THDoubleTensor_data(cum_dist)[(n_categories-1)*cum_dist->stride(0)] = 1; while(right_pointer - left_pointer > 0) { mid_pointer = left_pointer + 
(right_pointer - left_pointer) / 2; cum_prob = THDoubleStorage_get( \ cum_dist->storage, \ - cum_dist->storageOffset+mid_pointer*cum_dist->stride[0] \ + cum_dist->storageOffset+mid_pointer*cum_dist->stride(0) \ ); if (cum_prob < uniform_sample) { @@ -465,7 +465,7 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso /* store in result tensor (will be incremented for lua compat by wrapper) */ THLongStorage_set( \ self->storage, \ - self->storageOffset+i*self->stride[0]+j*self->stride[1], \ + self->storageOffset+i*self->stride(0)+j*self->stride(1), \ sample_idx \ ); @@ -481,13 +481,13 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso { new_val = THDoubleStorage_get( \ cum_dist->storage, \ - cum_dist->storageOffset+(sample_idx-1)*cum_dist->stride[0] \ + cum_dist->storageOffset+(sample_idx-1)*cum_dist->stride(0) \ ); } /* marginal cumulative mass (i.e. original probability) of sample */ diff = THDoubleStorage_get( \ cum_dist->storage, \ - cum_dist->storageOffset+sample_idx*cum_dist->stride[0] \ + cum_dist->storageOffset+sample_idx*cum_dist->stride(0) \ ) - new_val; /* new sum of marginals is not one anymore... */ sum = 1.0 - diff; @@ -495,7 +495,7 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso { new_val = THDoubleStorage_get( \ cum_dist->storage, \ - cum_dist->storageOffset+k*cum_dist->stride[0] \ + cum_dist->storageOffset+k*cum_dist->stride(0) \ ); if (k >= sample_idx) { @@ -506,7 +506,7 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso new_val /= sum; THDoubleStorage_set( \ cum_dist->storage, \ - cum_dist->storageOffset+k*cum_dist->stride[0], \ + cum_dist->storageOffset+k*cum_dist->stride(0), \ new_val \ ); } diff --git a/aten/src/THC/THCAllocator.h b/aten/src/THC/THCAllocator.h index 0f7724dd8d4fb2..652bb7a4a95d47 100644 --- a/aten/src/THC/THCAllocator.h +++ b/aten/src/THC/THCAllocator.h @@ -3,8 +3,8 @@ #include "THCGeneral.h" -THC_API THAllocator* getTHCudaHostAllocator(); -THC_API THAllocator* getTHCUVAAllocator(); +THC_API THAllocator* getTHCudaHostAllocator(void); +THC_API THAllocator* getTHCUVAAllocator(void); // IPC doesn't support (re)allocation #ifdef __cplusplus diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu index e2003da5791f97..f36fc74112eb8c 100644 --- a/aten/src/THC/THCBlas.cu +++ b/aten/src/THC/THCBlas.cu @@ -80,10 +80,19 @@ half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, #endif /* Level 2 */ + +void adjustLdLevel2(int64_t m, int64_t n, int64_t *lda) +{ + // Note: leading dimensions generally are checked that they are > 0 and at least as big the result + // requires (even if the value won't be used). + // TODO: why does Level3 check trans but this doesn't? 
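+  // (Re the TODO above: for the Level 2 routines the matrix argument is always
+  // stored as an m x n column-major array regardless of trans, so only m matters
+  // here; in Level 3 the stored shapes of A and B depend on transa/transb.)
+  // cuBLAS still validates lda >= max(1, m) even when n <= 1 means lda is never
+  // used to step between columns, e.g. a 5 x 1 operand built from a 1-d tensor
+  // can arrive with lda == 1 and has to be bumped to 5 (or to 1 when m == 0).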
+ if (n <= 1) + *lda = std::max(m, 1); +} + void THCudaBlas_Sgemv(THCState *state, char trans, int64_t m, int64_t n, float alpha, float *a, int64_t lda, float *x, int64_t incx, float beta, float *y, int64_t incy) { - if(n == 1) - lda = m; + adjustLdLevel2(m, n, &lda); cublasOperation_t op; if (trans == 't') op = CUBLAS_OP_T; @@ -113,8 +122,7 @@ void THCudaBlas_Sgemv(THCState *state, char trans, int64_t m, int64_t n, float a void THCudaBlas_Dgemv(THCState *state, char trans, int64_t m, int64_t n, double alpha, double *a, int64_t lda, double *x, int64_t incx, double beta, double *y, int64_t incy) { - if(n == 1) - lda = m; + adjustLdLevel2(m, n, &lda); cublasOperation_t op; if (trans == 't') op = CUBLAS_OP_T; @@ -144,8 +152,7 @@ void THCudaBlas_Dgemv(THCState *state, char trans, int64_t m, int64_t n, double void THCudaBlas_Sger(THCState *state, int64_t m, int64_t n, float alpha, float *x, int64_t incx, float *y, int64_t incy, float *a, int64_t lda) { - if(n == 1) - lda = m; + adjustLdLevel2(m, n, &lda); if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) { @@ -166,8 +173,7 @@ void THCudaBlas_Sger(THCState *state, int64_t m, int64_t n, float alpha, float * void THCudaBlas_Dger(THCState *state, int64_t m, int64_t n, double alpha, double *x, int64_t incx, double *y, int64_t incy, double *a, int64_t lda) { - if(n == 1) - lda = m; + adjustLdLevel2(m, n, &lda); if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) { @@ -197,41 +203,44 @@ cublasOperation_t convertTransToCublasOperation(char trans) { } } -void adjustLd(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t *lda, int64_t *ldb, int64_t *ldc) +void adjustLdLevel3(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t *lda, int64_t *ldb, int64_t *ldc) { int transa_ = ((transa == 't') || (transa == 'T')); int transb_ = ((transb == 't') || (transb == 'T')); - if(n == 1) - *ldc = m; + // Note: leading dimensions generally are checked that they are > 0 and at least as big the result + // requires (even if the value won't be used). 
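+  // Column-major gemm expects ldc >= max(1, m), lda >= max(1, m) for 'n' or
+  // max(1, k) for 't', and ldb >= max(1, k) for 'n' or max(1, n) for 't'; each
+  // value is only clamped below when the corresponding operand has at most one
+  // stored column, so the leading dimension is never actually used to advance
+  // between columns.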
+ if(n <= 1) + *ldc = std::max(m, 1); if(transa_) { - if(m == 1) - *lda = k; + if(m <= 1) + *lda = std::max(k, 1); } else { - if(k == 1) - *lda = m; + if(k <= 1) + *lda = std::max(m, 1); } if(transb_) { - if(k == 1) - *ldb = n; + if(k <= 1) + *ldb = std::max(n, 1); } else { - if(n == 1) - *ldb = k; + if(n <= 1) + *ldb = std::max(k, 1); } + } /* Level 3 */ void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, float *a, int64_t lda, float *b, int64_t ldb, float beta, float *c, int64_t ldc) { - adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); cublasOperation_t opb = convertTransToCublasOperation(transb); @@ -261,7 +270,7 @@ void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int6 void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, half alpha, half *a, int64_t lda, half *b, int64_t ldb, half beta, half *c, int64_t ldc) { - adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); cublasOperation_t opb = convertTransToCublasOperation(transb); @@ -312,7 +321,7 @@ void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int6 void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, double *a, int64_t lda, double *b, int64_t ldb, double beta, double *c, int64_t ldc) { - adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); cublasOperation_t opb = convertTransToCublasOperation(transb); @@ -346,7 +355,7 @@ void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, i "with the bound [val] <= %d", INT_MAX); } - adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); cublasOperation_t opb = convertTransToCublasOperation(transb); @@ -375,7 +384,7 @@ void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t "with the bound [val] <= %d", INT_MAX); } - adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); cublasOperation_t opb = convertTransToCublasOperation(transb); @@ -399,7 +408,7 @@ void THCudaBlas_SgemmStridedBatched(THCState *state, char transa, char transb, i "with the bound [val] <= %d", INT_MAX); } - adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); cublasOperation_t opb = convertTransToCublasOperation(transb); @@ -422,7 +431,7 @@ void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, int64_t "with the bound [val] <= %d", INT_MAX); } - adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); cublasOperation_t opb = convertTransToCublasOperation(transb); @@ -445,7 +454,7 @@ void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, i "with the bound [val] <= %d", INT_MAX); } - 
adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); cublasOperation_t opb = convertTransToCublasOperation(transb); diff --git a/aten/src/THC/THCCachingHostAllocator.h b/aten/src/THC/THCCachingHostAllocator.h index d270c2081e8726..adb86cbb120526 100644 --- a/aten/src/THC/THCCachingHostAllocator.h +++ b/aten/src/THC/THCCachingHostAllocator.h @@ -19,7 +19,7 @@ // Note that this allocator does not split larger allocations into smaller // blocks, unlike the caching device allocator. // -THC_API THAllocator* getTHCCachingHostAllocator(); +THC_API THAllocator* getTHCCachingHostAllocator(void); // Records an event in the specified stream. The allocation 'ptr' will not be // re-used until the event has occurred. diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp index ab92022d9a6a1d..4d46a01296cb20 100644 --- a/aten/src/THC/THCStorage.cpp +++ b/aten/src/THC/THCStorage.cpp @@ -8,58 +8,6 @@ #include "generic/THCStorage.cpp" #include "THCGenerateAllTypes.h" -THCStorage* THCStorage_new(THCState *state, at::ScalarType scalar_type) -{ - return THCStorage_newWithSize(state, scalar_type, 0); -} - -THCStorage* THCStorage_newWithSize(THCState *state, at::ScalarType scalar_type, ptrdiff_t size) -{ - return THCStorage_newWithAllocator( - state, scalar_type, size, - state->cudaDeviceAllocator); -} - -THCStorage* THCStorage_newWithAllocator(THCState *state, - at::ScalarType scalar_type, - ptrdiff_t size, - at::Allocator* allocator) -{ - THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage)); - memset(storage, 0, sizeof(THCStorage)); - new (&storage->refcount) std::atomic(1); - new (&storage->weakcount) std::atomic(1); - new (&storage->finalizer) std::unique_ptr(nullptr); - storage->scalar_type = scalar_type; - storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE; - storage->allocator = allocator; - storage->size = size; - - at::DataPtr ptr; - try { - ptr = allocator->allocate(size * at::elementSize(scalar_type)); - } catch(...) 
{ - free(storage); - throw; - } - new (&storage->data_ptr) at::DataPtr(std::move(ptr)); - return storage; -} - -void THCStorage_free(THCState *state, THCStorage *storage) -{ - if (storage->flag & TH_STORAGE_REFCOUNTED) { - if (--storage->refcount == 0) { - if (storage->finalizer) { - (*storage->finalizer)(); - } - storage->finalizer.~unique_ptr(); - storage->data_ptr.~DataPtr(); - THStorage_weakFree(storage); - } - } -} - void THCStorage_resize(THCState *state, THCStorage *self, ptrdiff_t size) { THArgCheck(size >= 0, 2, "invalid size"); @@ -103,18 +51,13 @@ int THCStorage_getDevice(THCState* state, const THCStorage* storage) { return storage->data_ptr.device().index(); } -THCStorage* THCStorage_newWithDataAndAllocator( - THCState *state, at::ScalarType scalar_type, at::DataPtr&& data, ptrdiff_t size, - at::Allocator *allocator) { - THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage)); - memset(storage, 0, sizeof(THCStorage)); - storage->scalar_type = scalar_type; - new (&storage->data_ptr) at::DataPtr(std::move(data)); - storage->size = size; - new (&storage->refcount) std::atomic(1); - new (&storage->weakcount) std::atomic(1); - new (&storage->finalizer) std::unique_ptr(nullptr); - storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE; - storage->allocator = allocator; +THC_API THCStorage* THCStorage_new( + THCState* state, + at::ScalarType scalar_type) { + THStorage* storage = new THStorage( + scalar_type, + 0, + state->cudaDeviceAllocator, + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); return storage; } diff --git a/aten/src/THC/THCStorage.h b/aten/src/THC/THCStorage.h index 22a607ce43107f..d14df7f50859ae 100644 --- a/aten/src/THC/THCStorage.h +++ b/aten/src/THC/THCStorage.h @@ -1,7 +1,7 @@ #ifndef THC_STORAGE_INC #define THC_STORAGE_INC -#include "THStorage.h" +#include "THStorageFunctions.h" #include "THCGeneral.h" #define THCStorage_(NAME) TH_CONCAT_4(TH,CReal,Storage_,NAME) diff --git a/aten/src/THC/THCStorage.hpp b/aten/src/THC/THCStorage.hpp index ae5ad7bd8cdc72..8ab7e27fec485a 100644 --- a/aten/src/THC/THCStorage.hpp +++ b/aten/src/THC/THCStorage.hpp @@ -4,7 +4,8 @@ // read Note [TH abstraction violation] #include "THCStorage.h" -#include +// Should work with THStorageClass +#include #include "ATen/ScalarType.h" #include "ATen/ScalarTypeUtils.h" @@ -17,19 +18,10 @@ struct CTypeToScalarType<__half> : public CTypeToScalarType {}; } -THC_API THCStorage* THCStorage_new(THCState *state, at::ScalarType scalar_type); -THC_API THCStorage* THCStorage_newWithSize(THCState *state, at::ScalarType scalar_type, ptrdiff_t size); - -THC_API THCStorage* THCStorage_newWithAllocator(THCState *state, - at::ScalarType scalar_type, - ptrdiff_t size, - at::Allocator* allocator); +THC_API THCStorage* THCStorage_new(THCState* state, at::ScalarType); THC_API void THCStorage_retain(THCState *state, THCStorage *storage); -// This exists to have a data-type independent way of freeing (necessary for THPPointer). 
-THC_API void THCStorage_free(THCState *state, THCStorage *self); - THC_API void THCStorage_resize(THCState *state, THCStorage *storage, ptrdiff_t size); THC_API int THCStorage_getDevice(THCState* state, const THCStorage* storage); diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index 7ecf02f014a342..13fdff6b3b566b 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -19,16 +19,16 @@ int THCTensor__nDimension(THCState *state, const THCTensor *self) { int64_t THCTensor_size(THCState *state, const THCTensor *self, int dim) { THArgCheck((dim >= 0) && (dim < self->dim()), 2, "out of range"); - return self->size[dim]; + return self->size(dim); } int64_t THCTensor_stride(THCState *state, const THCTensor *self, int dim) { THArgCheck((dim >= 0) && (dim < self->dim()), 2, "out of range"); - return self->stride[dim]; + return self->stride(dim); } THLongStorage *THCTensor_newSizeOf(THCState *state, THCTensor *self) { THLongStorage *size = THLongStorage_newWithSize(self->dim()); - THLongStorage_rawCopy(size, self->size); + THLongStorage_rawCopy(size, THTensor_getSizePtr(self)); return size; } @@ -73,7 +73,7 @@ void THCTensor_resizeAs(THCState *state, THCTensor *self, THCTensor *src) { isSame = 1; for(d = 0; d < self->dim(); d++) { - if(self->size[d] != src->size[d]) + if(self->size(d) != src->size(d)) { isSame = 0; break; @@ -82,7 +82,7 @@ void THCTensor_resizeAs(THCState *state, THCTensor *self, THCTensor *src) { } if(!isSame) - THCTensor_resizeNd(state, self, src->dim(), src->size, NULL); + THCTensor_resizeNd(state, self, src->dim(), THTensor_getSizePtr(src), NULL); } void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, int64_t *size, int64_t *stride) @@ -108,12 +108,12 @@ void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, int64_ AT_CHECK(size[d] > 0, "sizes must be non-negative"); } #endif - if((self->dim() > d) && (size[d] != self->size[d])) { + if((self->dim() > d) && (size[d] != self->size(d))) { hascorrectsize = false; } // NB: this used to test that stride[d] was >= 0 - if((self->dim() > d) && stride && (stride[d] != self->stride[d])) { + if((self->dim() > d) && stride && (stride[d] != self->stride(d))) { hascorrectsize = false; } } @@ -128,26 +128,24 @@ void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, int64_ if(nDimension != self->dim()) { - self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*nDimension); - self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*nDimension); - self->dim_ = nDimension; + THTensor_resizeDim(self, nDimension); } totalSize = 1; for(d = nDimension-1; d >= 0; d--) { - self->size[d] = size[d]; + THTensor_setSizeAtDim(self, d, size[d]); if(stride && (stride[d] >= 0) ) { - self->stride[d] = stride[d]; + THTensor_setStrideAtDim(self, d, stride[d]); } else { if(d == nDimension-1) { - self->stride[d] = 1; + THTensor_setStrideAtDim(self, d, 1); } else { // Keep stride monotonically increasing to match NumPy. 
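+            // e.g. resizing to {2, 3, 4} gives strides {12, 4, 1}; the max(..., 1)
+            // keeps strides positive when some size is 0, so {2, 0, 4} still gives
+            // {4, 4, 1} instead of introducing a zero stride.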
- self->stride[d] = std::max(self->size[d+1],1)*self->stride[d+1]; + THTensor_setStrideAtDim(self, d, std::max(self->size(d+1),1)*self->stride(d+1)); } } - totalSize += (self->size[d]-1)*self->stride[d]; + totalSize += (self->size(d)-1)*self->stride(d); } if(totalSize+self->storageOffset > 0) @@ -169,8 +167,8 @@ void THCTensor_set(THCState *state, THCTensor *self, THCTensor *src) src->storage, src->storageOffset, src->dim(), - src->size, - src->stride); + THTensor_getSizePtr(src), + THTensor_getStridePtr(src)); } void THCTensor_setStorageNd(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) @@ -182,7 +180,7 @@ void THCTensor_setStorageNd(THCState *state, THCTensor *self, THCStorage *storag THError("Tensor: invalid null storage"); } auto scalar_type = self->storage->scalar_type; - THCStorage_free(state, self->storage); + THStorage_free(self->storage); if(storage) { @@ -214,17 +212,17 @@ void THCTensor_squeeze1d(THCState *state, THCTensor *self, THCTensor *src, int d THCTensor_set(state, self, src); #ifdef TH_SCALAR - if(src->size[dimension] == 1) + if(src->size(dimension) == 1) #else - if(src->size[dimension] == 1 && src->dim() > 1) + if(src->size(dimension) == 1 && src->dim() > 1) #endif { for(d = dimension; d < self->dim()-1; d++) { - self->size[d] = self->size[d+1]; - self->stride[d] = self->stride[d+1]; + THTensor_setSizeAtDim(self, d, self->size(d+1)); + THTensor_setStrideAtDim(self, d, self->stride(d+1)); } - self->dim_--; + THTensor_resizeDim(self, self->dim_ - 1); } } @@ -242,19 +240,17 @@ void THCTensor_unsqueeze1d(THCState *state, THCTensor *self, THCTensor *src, int THCTensor_set(state, self, src); - self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*(self->dim()+1)); - self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*(self->dim()+1)); - self->dim_++; + THTensor_resizeDim(self, self->dim() + 1); for (d = self->dim()-1; d > dimension; d--) { - self->size[d] = self->size[d-1]; - self->stride[d] = self->stride[d-1]; + THTensor_setSizeAtDim(self, d, self->size(d-1)); + THTensor_setStrideAtDim(self, d, self->stride(d-1)); } if (dimension+1 < self->dim()) { - self->stride[dimension] = self->size[dimension+1] * self->stride[dimension+1]; + THTensor_setStrideAtDim(self, dimension, self->size(dimension+1) * self->stride(dimension+1)); } else { - self->stride[dimension] = 1; + THTensor_setStrideAtDim(self, dimension, 1); } - self->size[dimension] = 1; + THTensor_setSizeAtDim(self, dimension, 1); } bool THCTensor_isContiguous(THCState *state, const THCTensor *self) { @@ -263,10 +259,10 @@ bool THCTensor_isContiguous(THCState *state, const THCTensor *self) { int d; for(d = self->dim()-1; d >= 0; d--) { - if(self->size[d] != 1) + if(self->size(d) != 1) { - if(self->stride[d] == z) - z *= self->size[d]; + if(self->stride(d) == z) + z *= self->size(d); else return false; } @@ -292,33 +288,18 @@ ptrdiff_t THCTensor_nElement(THCState *state, const THCTensor *self) { ptrdiff_t nElement = 1; int d; for(d = 0; d < self->_dim(); d++) - nElement *= self->size[d]; + nElement *= self->size(d); return nElement; } } void THCTensor_retain(THCState *state, THCTensor *self) { - if(self->flag & TH_TENSOR_REFCOUNTED) - self->refcount++; + self->refcount++; } void THCTensor_free(THCState *state, THCTensor *self) { - if(!self) - return; - - if(self->flag & TH_TENSOR_REFCOUNTED) - { - if(--self->refcount == 0) - { - THFree(self->size); - THFree(self->stride); - if(self->storage) - THCStorage_free(state, 
self->storage); - self->refcount.~atomic(); - THFree(self); - } - } + THTensor_free(self); } int THCTensor_getDevice(THCState* state, const THCTensor* tensor) { diff --git a/aten/src/THC/THCTensor.hpp b/aten/src/THC/THCTensor.hpp index d4fa2c6835f46e..56147b27e912c4 100644 --- a/aten/src/THC/THCTensor.hpp +++ b/aten/src/THC/THCTensor.hpp @@ -10,54 +10,6 @@ #include #include -typedef struct THCTensor -{ - int64_t *size; - int64_t *stride; - int64_t dim_; - - THCStorage *storage; - ptrdiff_t storageOffset; - std::atomic refcount; - - char flag; - - template - inline T * data() const { - return storage->data() + storageOffset; - } - - template - inline T * unsafe_data() const { - return storage->unsafe_data() + storageOffset; - } - - // [NOTE: _dim() vs dim()] - // _dim() returns the "old" TH dimension view where no dimensions represents an empty tensor. - // dim() returns the ATen view of the dimensionality, i.e. 0-sized dimensions are supported. - inline int64_t _dim() const { - return is_empty() ? 0: dim_; - } - - inline int64_t dim() const { - return dim_; - } - - // represents that numel() == 0. - inline bool is_empty() const { - for (int64_t i = 0; i < dim_; ++i) { - if (size[i] == 0) { - return true; - } - } - return false; - } - - inline at::IntList sizes() { - return at::IntList(size, dim_); - } -} THCTensor; - // See [NOTE: _dim() vs dim()]; _nDimension corresponds to _dim(), nDimension corresponds to dim(). THC_API int THCTensor_nDimension(THCState *state, const THCTensor *self); THC_API int THCTensor__nDimension(THCState *state, const THCTensor *self); diff --git a/aten/src/THC/THCTensorRandom.cuh b/aten/src/THC/THCTensorRandom.cuh index 7749f231c5c771..fc3d7fb49fec81 100644 --- a/aten/src/THC/THCTensorRandom.cuh +++ b/aten/src/THC/THCTensorRandom.cuh @@ -160,8 +160,8 @@ sampleMultinomialOnce(int64_t* dest, int categories, T* sampled, T* dist, - int stride_dist, // dist->stride[0] - int stride_categories // dist->stride[1] + int stride_dist, // dist->stride(0) + int stride_categories // dist->stride(1) ) { extern __shared__ unsigned char my_smem[]; __shared__ bool found; diff --git a/aten/src/THC/generic/THCStorage.cpp b/aten/src/THC/generic/THCStorage.cpp index 98b4c3bebb40fb..91754e70f00328 100644 --- a/aten/src/THC/generic/THCStorage.cpp +++ b/aten/src/THC/generic/THCStorage.cpp @@ -40,19 +40,33 @@ real THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index) THCStorage* THCStorage_(new)(THCState *state) { - return THCStorage_new(state, at::CTypeToScalarType::to()); + THStorage* storage = new THStorage( + at::CTypeToScalarType::to(), + 0, + state->cudaDeviceAllocator, + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); + return storage; } THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size) { - return THCStorage_newWithSize(state, at::CTypeToScalarType::to(), size); + THStorage* storage = new THStorage( + at::CTypeToScalarType::to(), + size, + state->cudaDeviceAllocator, + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); + return storage; } THCStorage* THCStorage_(newWithAllocator)(THCState *state, ptrdiff_t size, at::Allocator* allocator) { - return THCStorage_newWithAllocator(state, at::CTypeToScalarType::to(), - size, allocator); + THStorage* storage = new THStorage( + at::CTypeToScalarType::to(), + size, + allocator, + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); + return storage; } THCStorage* THCStorage_(newWithSize1)(THCState *state, real data0) @@ -96,9 +110,17 @@ THCStorage* THCStorage_(newWithMapping)(THCState *state, const char 
*fileName, p } THCStorage* THCStorage_(newWithDataAndAllocator)( - THCState *state, at::DataPtr&& data, ptrdiff_t size, - at::Allocator *allocator) { - return THCStorage_newWithDataAndAllocator(state, at::CTypeToScalarType::to(), std::move(data), size, allocator); + THCState* state, + at::DataPtr&& data, + ptrdiff_t size, + at::Allocator* allocator) { + THStorage* storage = new THStorage( + at::CTypeToScalarType::to(), + size, + std::move(data), + allocator, + TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE); + return storage; } void THCStorage_(setFlag)(THCState *state, THCStorage *storage, const char flag) @@ -118,6 +140,6 @@ void THCStorage_(retain)(THCState *state, THCStorage *self) void THCStorage_(free)(THCState *state, THCStorage *self) { - THCStorage_free(state, self); + THStorage_free(self); } #endif diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp index d8ee2045bcc60e..3b03e37232ef05 100644 --- a/aten/src/THC/generic/THCTensor.cpp +++ b/aten/src/THC/generic/THCTensor.cpp @@ -41,7 +41,7 @@ THLongStorage *THCTensor_(newSizeOf)(THCState *state, THCTensor *self) THLongStorage *THCTensor_(newStrideOf)(THCState *state, THCTensor *self) { THLongStorage *stride = THLongStorage_newWithSize(self->dim()); - THLongStorage_rawCopy(stride, self->stride); + THLongStorage_rawCopy(stride, THTensor_getStridePtr(self)); return stride; } @@ -53,53 +53,36 @@ real *THCTensor_(data)(THCState *state, const THCTensor *self) return NULL; } -void THCTensor_(setFlag)(THCState *state, THCTensor *self, const char flag) -{ - self->flag |= flag; -} - -void THCTensor_(clearFlag)(THCState *state, THCTensor *self, const char flag) -{ - self->flag &= ~flag; -} - /**** creation methods ****/ -static void THCTensor_(rawInit)(THCState *state, THCTensor *self); - - /* Empty init */ THCTensor *THCTensor_(new)(THCState *state) { - THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor)); - THCTensor_(rawInit)(state, self); - return self; + return new THCTensor(THCStorage_(new)(state)); } /* Pointer-copy init */ THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor) { - THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor)); - THCTensor_(rawInit)(state, self); + THCTensor *self = new THCTensor(THCStorage_(new)(state)); THCTensor_(setStorageNd)(state, self, tensor->storage, tensor->storageOffset, tensor->dim(), - tensor->size, - tensor->stride); + THTensor_getSizePtr(tensor), + THTensor_getStridePtr(tensor)); return self; } /* Storage init */ THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, THLongStorage *size, THLongStorage *stride) { - THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor)); if(size && stride) THArgCheck(size->size == stride->size, 4, "inconsistent size"); AT_CHECK(size, "size must not be null"); - THCTensor_(rawInit)(state, self); + THCTensor *self = new THCTensor(THCStorage_(new)(state)); THCTensor_(setStorageNd)(state, self, storage, @@ -113,8 +96,7 @@ THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrd THCTensor *THCTensor_(newWithStorageIntLists)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, at::IntList sizes, at::IntList strides) { AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); - THCTensor *self = (THCTensor *)THAlloc(sizeof(THCTensor)); - THCTensor_(rawInit)(state, self); + THCTensor *self = new THCTensor(THCStorage_(new)(state)); THCTensor_(setStorageNd)(state, self, storage, storageOffset, sizes.size(), 
const_cast(sizes.data()), const_cast(strides.data())); @@ -159,8 +141,7 @@ THCTensor *THCTensor_(newWithSize)(THCState *state, THLongStorage *size, THLongS } THCTensor *THCTensor_(newWithSizeIntList)(THCState *state, at::IntList sizes) { - THCTensor *self = (THCTensor *)THAlloc(sizeof(THCTensor)); - THCTensor_(rawInit)(state, self); + THCTensor *self = new THCTensor(THCStorage_(new)(state)); THCTensor_(resizeNd)(state, self, sizes.size(), const_cast(sizes.data()), nullptr); return self; @@ -237,8 +218,8 @@ THCTensor *THCTensor_(newView)(THCState *state, THCTensor *tensor, THLongStorage ptrdiff_t numel = THCTensor_(nElement)(state, tensor); THCTensor *self = THCTensor_(new)(state); THLongStorage *inferred_size = THLongStorage_newInferSize(size, numel); - auto stride = THTensor_compute_stride(at::IntList(tensor->size, tensor->dim()), - at::IntList(tensor->stride, tensor->dim()), + auto stride = THTensor_compute_stride(tensor->sizes(), + tensor->strides(), at::IntList(inferred_size->data(), inferred_size->size)); THArgCheck(stride.has_value(), 2, "view size is " "not compatible with input tensor's size and stride (at least one dimension spans " @@ -391,14 +372,14 @@ void THCTensor_(narrow)(THCState *state, THCTensor *self, THCTensor *src, int di #else THArgCheck( size > 0, 5, "out of range"); #endif - THArgCheck(firstIndex+size <= src->size[dimension], 5, "out of range"); + THArgCheck(firstIndex+size <= src->size(dimension), 5, "out of range"); THCTensor_(set)(state, self, src); if(firstIndex > 0) - self->storageOffset += firstIndex*self->stride[dimension]; + self->storageOffset += firstIndex*self->stride(dimension); - self->size[dimension] = size; + THTensor_setSizeAtDim(self, dimension, size); } void THCTensor_(select)(THCState *state, THCTensor *self, THCTensor *src, int dimension, int64_t sliceIndex) @@ -408,20 +389,24 @@ void THCTensor_(select)(THCState *state, THCTensor *self, THCTensor *src, int di if(!src) src = self; -#ifndef USE_TH_SCALAR +#ifndef USE_TH_SIZE_ZERO_DIM THArgCheck(src->_dim() > 1, 1, "cannot select on a vector"); +#else +#ifndef USE_TH_SCALAR + THArgCheck(src->dim() > 1, 1, "cannot select on a vector"); +#endif #endif THArgCheck((dimension >= 0) && (dimension < src->dim()), 3, "out of range"); - THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size[dimension]), 4, "out of range"); + THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size(dimension)), 4, "out of range"); THCTensor_(set)(state, self, src); THCTensor_(narrow)(state, self, NULL, dimension, sliceIndex, 1); for(d = dimension; d < self->dim()-1; d++) { - self->size[d] = self->size[d+1]; - self->stride[d] = self->stride[d+1]; + THTensor_setSizeAtDim(self, d, self->size(d+1)); + THTensor_setStrideAtDim(self, d, self->stride(d+1)); } - self->dim_--; + THTensor_resizeDim(self, self->dim_ - 1); } void THCTensor_(transpose)(THCState *state, THCTensor *self, THCTensor *src, int dimension1, int dimension2) @@ -431,26 +416,24 @@ void THCTensor_(transpose)(THCState *state, THCTensor *self, THCTensor *src, int if(!src) src = self; - THArgCheck( (dimension1 >= 0) && (dimension1 < src->_dim()), 1, "out of range"); - THArgCheck( (dimension2 >= 0) && (dimension2 < src->_dim()), 2, "out of range"); + THArgCheck( (dimension1 >= 0) && (dimension1 < src->dim()), 1, "out of range"); + THArgCheck( (dimension2 >= 0) && (dimension2 < src->dim()), 2, "out of range"); THCTensor_(set)(state, self, src); if(dimension1 == dimension2) return; - z = self->stride[dimension1]; - self->stride[dimension1] = self->stride[dimension2]; - 
self->stride[dimension2] = z; - z = self->size[dimension1]; - self->size[dimension1] = self->size[dimension2]; - self->size[dimension2] = z; + z = self->stride(dimension1); + THTensor_setStrideAtDim(self, dimension1, self->stride(dimension2)); + THTensor_setStrideAtDim(self, dimension2, z); + z = self->size(dimension1); + THTensor_setSizeAtDim(self, dimension1, self->size(dimension2)); + THTensor_setSizeAtDim(self, dimension2, z); } void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src, int dimension, int64_t size, int64_t step) { - int64_t *newSize; - int64_t *newStride; int d; if(!src) @@ -460,36 +443,31 @@ void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src, int di THArgCheck(!src->is_empty(), 1, "cannot unfold an empty tensor"); #endif THArgCheck(dimension < src->dim(), 2, "out of range"); - THArgCheck(size <= src->size[dimension], 3, "out of range"); + THArgCheck(size <= src->size(dimension), 3, "out of range"); THArgCheck(step > 0, 4, "invalid step"); THCTensor_(set)(state, self, src); - newSize = (int64_t*)THAlloc(sizeof(int64_t)*(self->dim()+1)); - newStride = (int64_t*)THAlloc(sizeof(int64_t)*(self->dim()+1)); + std::vector newSize(self->dim() + 1); + std::vector newStride(self->dim() + 1); newSize[self->dim()] = size; - newStride[self->dim()] = self->stride[dimension]; + newStride[self->dim()] = self->stride(dimension); for(d = 0; d < self->dim(); d++) { if(d == dimension) { - newSize[d] = (self->size[d] - size) / step + 1; - newStride[d] = step*self->stride[d]; + newSize[d] = (self->size(d) - size) / step + 1; + newStride[d] = step*self->stride(d); } else { - newSize[d] = self->size[d]; - newStride[d] = self->stride[d]; + newSize[d] = self->size(d); + newStride[d] = self->stride(d); } } - THFree(self->size); - THFree(self->stride); - - self->size = newSize; - self->stride = newStride; - self->dim_++; + THTensor_setSizesAndStrides(self, std::move(newSize), std::move(newStride)); } /* we have to handle the case where the result is a number */ @@ -505,12 +483,12 @@ void THCTensor_(squeeze)(THCState *state, THCTensor *self, THCTensor *src) for(d = 0; d < src->dim(); d++) { - if(src->size[d] != 1) + if(src->size(d) != 1) { if(d != ndim) { - self->size[ndim] = src->size[d]; - self->stride[ndim] = src->stride[d]; + THTensor_setSizeAtDim(self, ndim, src->size(d)); + THTensor_setStrideAtDim(self, ndim, src->stride(d)); } ndim++; } @@ -520,11 +498,11 @@ void THCTensor_(squeeze)(THCState *state, THCTensor *self, THCTensor *src) /* right now, we do not handle 0-dimension tensors */ if(ndim == 0 && src->dim() > 0) { - self->size[0] = 1; - self->stride[0] = 1; + THTensor_setSizeAtDim(self, 0, 1); + THTensor_setStrideAtDim(self, 0, 1); ndim = 1; } - self->dim_ = ndim; + THTensor_resizeDim(self, ndim); } #endif @@ -551,7 +529,7 @@ int THCTensor_(isSize)(THCState *state, const THCTensor *self, const THLongStora for (d = 0; d < self->dim(); ++d) { - if (self->size[d] != THLongStorage_data(dims)[d]) + if (self->size(d) != THLongStorage_data(dims)[d]) return 0; } return 1; @@ -566,7 +544,7 @@ int THCTensor_(isSetTo)(THCState *state, const THCTensor *self, const THCTensor int d; for (d = 0; d < self->dim(); ++d) { - if (self->size[d] != src->size[d] || self->stride[d] != src->stride[d]) + if (self->size(d) != src->size(d) || self->stride(d) != src->stride(d)) return 0; } return 1; @@ -581,7 +559,7 @@ int THCTensor_(isSameSizeAs)(THCState *state, const THCTensor *self, const THCTe return 0; for(d = 0; d < self->dim(); ++d) { - if(self->size[d] != src->size[d]) 
+ if(self->size(d) != src->size(d)) return 0; } return 1; @@ -612,19 +590,6 @@ void THCTensor_(freeCopyTo)(THCState *state, THCTensor *self, THCTensor *dst) /*******************************************************************************/ -static void THCTensor_(rawInit)(THCState *state, THCTensor *self) -{ - new (&self->refcount) std::atomic(1); - self->storage = THCStorage_(new)(state); - self->storageOffset = 0; - self->size = static_cast(THAlloc(sizeof(int64_t))); - self->stride = static_cast(THAlloc(sizeof(int64_t))); - self->size[0] = 0; - self->stride[0] = 1; - self->dim_ = 1; - self->flag = TH_TENSOR_REFCOUNTED; -} - void THCTensor_(setStorageNd)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) { THCTensor_setStorageNd(state, self, storage, storageOffset, nDimension, size, stride); @@ -638,57 +603,57 @@ void THCTensor_(resizeNd)(THCState *state, THCTensor *self, int nDimension, int6 void THCTensor_(set1d)(THCState *state, THCTensor *tensor, int64_t x0, real value) { THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); - THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0], value); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); + THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0), value); } real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, int64_t x0) { THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); - return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); + return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)); } void THCTensor_(set2d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, real value) { THArgCheck(tensor->dim() == 2, 1, "tensor must have two dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); - THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1], value); + THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range"); + THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1), value); } real THCTensor_(get2d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1) { THArgCheck(tensor->dim() == 2, 1, "tensor must have two dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); - return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]); + THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)), 2, "out of range"); + return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)); } void THCTensor_(set3d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, real value) { THArgCheck(tensor->dim() == 3, 1, "tensor must have three dimensions"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); - THCStorage_(set)(state, 
tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2], value); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range"); + THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2), value); } real THCTensor_(get3d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2) { THArgCheck(tensor->dim() == 3, 1, "tensor must have three dimensions"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); - return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)), 2, "out of range"); + return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)); } void THCTensor_(set4d)(THCState *state, THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3, real value) { THArgCheck(tensor->dim() == 4, 1, "tensor must have four dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); - THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3], value); + THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range"); + THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3), value); } real THCTensor_(get4d)(THCState *state, const THCTensor *tensor, int64_t x0, int64_t x1, int64_t x2, int64_t x3) { THArgCheck(tensor->dim() == 4, 1, "tensor must have four dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); - return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3]); + THArgCheck((x0 >= 0) && (x0 < tensor->size(0)) && (x1 >= 0) && (x1 < tensor->size(1)) && (x2 >= 0) && (x2 < tensor->size(2)) && (x3 >= 0) && (x3 < tensor->size(3)), 2, "out of range"); + return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride(0)+x1*tensor->stride(1)+x2*tensor->stride(2)+x3*tensor->stride(3)); } int THCTensor_(checkGPU)(THCState *state, unsigned int nTensors, ...) 
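Note on the accessor pattern in the hunks above: the raw int64_t* size/stride arrays are no longer read or written directly; reads go through size(d)/stride(d), and writes go through THTensor_setSizeAtDim, THTensor_setStrideAtDim, THTensor_resizeDim and THTensor_setSizesAndStrides. A minimal standalone sketch of the read side, using a toy struct rather than the real THCTensor, is:

    #include <cstdint>
    #include <vector>

    // Toy stand-in for a strided tensor; not the real THCTensor.
    struct ToyTensor {
      std::vector<int64_t> sizes_, strides_;
      int64_t dim() const { return static_cast<int64_t>(sizes_.size()); }
      int64_t size(int64_t d) const { return sizes_[d]; }
      int64_t stride(int64_t d) const { return strides_[d]; }
    };

    // Same check as THCTensor_isContiguous above, written against the accessors:
    // skipping size-1 dimensions, each stride must equal the product of the
    // sizes of the dimensions to its right.
    bool isContiguous(const ToyTensor& t) {
      int64_t z = 1;
      for (int64_t d = t.dim() - 1; d >= 0; d--) {
        if (t.size(d) != 1) {
          if (t.stride(d) != z) return false;
          z *= t.size(d);
        }
      }
      return true;
    }
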
@@ -747,7 +712,7 @@ THCDescBuff THCTensor_(sizeDesc)(THCState *state, const THCTensor *tensor) { int i; for(i = 0; i < tensor->dim(); i++) { if(n >= L) break; - n += snprintf(str+n, L-n, "%" PRId64, tensor->size[i]); + n += snprintf(str+n, L-n, "%" PRId64, tensor->size(i)); if(i < tensor->dim()-1) { n += snprintf(str+n, L-n, " x "); } diff --git a/aten/src/THC/generic/THCTensor.h b/aten/src/THC/generic/THCTensor.h index e3e3648c3b3a5a..8e9bf84727420e 100644 --- a/aten/src/THC/generic/THCTensor.h +++ b/aten/src/THC/generic/THCTensor.h @@ -2,8 +2,6 @@ #define THC_GENERIC_FILE "generic/THCTensor.h" #else -#define TH_TENSOR_REFCOUNTED 1 - typedef struct THCTensor THCTensor; // These used to be distinct types; for some measure of backwards compatibility and documentation diff --git a/aten/src/THC/generic/THCTensorIndex.cu b/aten/src/THC/generic/THCTensorIndex.cu index 0e6a7ffcc1d47b..f93ad4dfe14dd2 100644 --- a/aten/src/THC/generic/THCTensorIndex.cu +++ b/aten/src/THC/generic/THCTensorIndex.cu @@ -19,14 +19,14 @@ static ptrdiff_t THCTensor_(getSliceSize)(THCState *state, THCTensor *dst, ptrdiff_t dstSliceSize = 1; for (int d = 0; d < dstDims; d++) { if (d != dim) { - dstSliceSize *= dst->size[d]; + dstSliceSize *= dst->size(d); } } if (src == nullptr) return dstSliceSize; THArgCheck(dim < srcDims, 3, "Indexing dim is out of bounds"); - THArgCheck(THCudaLongTensor_nElement(state, index) == src->size[dim], 4, + THArgCheck(THCudaLongTensor_nElement(state, index) == src->size(dim), 4, "length of src.size[dim] is not equal to length of indices"); ptrdiff_t srcSliceSize = 1; @@ -36,8 +36,8 @@ static ptrdiff_t THCTensor_(getSliceSize)(THCState *state, THCTensor *dst, for (int d = 0; d < srcDims; d++) { if (d != dim) { - srcSliceSize *= src->size[d]; - if (!mismatch && dst->size[d] != src->size[d]) mismatch = true; + srcSliceSize *= src->size(d); + if (!mismatch && dst->size(d) != src->size(d)) mismatch = true; } } @@ -224,7 +224,7 @@ void THCTensor_(take)(THCState *state, THCTensor *dst, THCTensor *src, THCudaLon THArgCheck(!(THCTensor_(_nDimension)(state, src) == 0 && THCudaLongTensor__nDimension(state, index) != 0), 2, "tried to take from an empty tensor"); - THCTensor_(resizeNd)(state, dst, index->dim(), index->size, NULL); + THCTensor_(resizeNd)(state, dst, index->dim(), THTensor_getSizePtr(index), NULL); // dispatchTakePut only handles non-empty tensors; if (index->_dim() > 0) { diff --git a/aten/src/THC/generic/THCTensorMath.cu b/aten/src/THC/generic/THCTensorMath.cu index 8bdd8fa68871e8..07033fa0e8f1d8 100644 --- a/aten/src/THC/generic/THCTensorMath.cu +++ b/aten/src/THC/generic/THCTensorMath.cu @@ -314,9 +314,9 @@ void THCTensor_(nonzero)(THCState* state, THCudaLongTensor *tensor, strided_tensor.begin(), strided_tensor.end(), stride_dim.begin(), - idx_functor(div, self->size[dim]) + idx_functor(div, self->size(dim)) ); - div *= self->size[dim]; + div *= self->size(dim); } THCudaLongTensor_resize2d(state, tensor, num_nonzeros, num_dim); diff --git a/aten/src/THC/generic/THCTensorMathBlas.cu b/aten/src/THC/generic/THCTensorMathBlas.cu index 6d1da07d74b947..babbd6d24eb61d 100644 --- a/aten/src/THC/generic/THCTensorMathBlas.cu +++ b/aten/src/THC/generic/THCTensorMathBlas.cu @@ -49,16 +49,16 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, mat, vec)); - if( (mat->_dim() != 2) || (vec->_dim() != 1) ) + if( (mat->dim() != 
2) || (vec->dim() != 1) ) THError("matrix and vector expected"); - if( mat->size[1] != vec->size[0] ) + if( mat->size(1) != vec->size(0) ) THError("size mismatch"); - if(t->_dim() != 1) + if(t->dim() != 1) THError("size mismatch"); - if(t->size[0] != mat->size[0]) + if(t->size(0) != mat->size(0)) THError("size mismatch"); #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) @@ -68,32 +68,32 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real THCTensor_(copy)(state, r_, t); } - if(mat->stride[0] == 1) + if(mat->stride(0) == 1) { #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sgemv(state, 'n', mat->size[0], mat->size[1], - alpha, THCTensor_(data)(state, mat), mat->stride[1], - THCTensor_(data)(state, vec), vec->stride[0], - beta, THCTensor_(data)(state, r_), r_->stride[0]); + THCudaBlas_Sgemv(state, 'n', mat->size(0), mat->size(1), + alpha, THCTensor_(data)(state, mat), mat->stride(1), + THCTensor_(data)(state, vec), vec->stride(0), + beta, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dgemv(state, 'n', mat->size[0], mat->size[1], - alpha, THCTensor_(data)(state, mat), mat->stride[1], - THCTensor_(data)(state, vec), vec->stride[0], - beta, THCTensor_(data)(state, r_), r_->stride[0]); + THCudaBlas_Dgemv(state, 'n', mat->size(0), mat->size(1), + alpha, THCTensor_(data)(state, mat), mat->stride(1), + THCTensor_(data)(state, vec), vec->stride(0), + beta, THCTensor_(data)(state, r_), r_->stride(0)); #endif } - else if(mat->stride[1] == 1) + else if(mat->stride(1) == 1) { #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sgemv(state, 't', mat->size[1], mat->size[0], - alpha, THCTensor_(data)(state, mat), mat->stride[0], - THCTensor_(data)(state, vec), vec->stride[0], - beta, THCTensor_(data)(state, r_), r_->stride[0]); + THCudaBlas_Sgemv(state, 't', mat->size(1), mat->size(0), + alpha, THCTensor_(data)(state, mat), mat->stride(0), + THCTensor_(data)(state, vec), vec->stride(0), + beta, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dgemv(state, 't', mat->size[1], mat->size[0], - alpha, THCTensor_(data)(state, mat), mat->stride[0], - THCTensor_(data)(state, vec), vec->stride[0], - beta, THCTensor_(data)(state, r_), r_->stride[0]); + THCudaBlas_Dgemv(state, 't', mat->size(1), mat->size(0), + alpha, THCTensor_(data)(state, mat), mat->stride(0), + THCTensor_(data)(state, vec), vec->stride(0), + beta, THCTensor_(data)(state, r_), r_->stride(0)); #endif } else @@ -101,32 +101,42 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real THCTensor *cmat = THCTensor_(newContiguous)(state, mat); #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sgemv(state, 't', mat->size[1], mat->size[0], - alpha, THCTensor_(data)(state, cmat), cmat->stride[0], - THCTensor_(data)(state, vec), vec->stride[0], - beta, THCTensor_(data)(state, r_), r_->stride[0]); + THCudaBlas_Sgemv(state, 't', mat->size(1), mat->size(0), + alpha, THCTensor_(data)(state, cmat), cmat->stride(0), + THCTensor_(data)(state, vec), vec->stride(0), + beta, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dgemv(state, 't', mat->size[1], mat->size[0], - alpha, THCTensor_(data)(state, cmat), cmat->stride[0], - THCTensor_(data)(state, vec), vec->stride[0], - beta, THCTensor_(data)(state, r_), r_->stride[0]); + THCudaBlas_Dgemv(state, 't', mat->size(1), mat->size(0), + alpha, THCTensor_(data)(state, cmat), cmat->stride(0), + THCTensor_(data)(state, vec), vec->stride(0), + beta, 
THCTensor_(data)(state, r_), r_->stride(0)); #endif THCTensor_(free)(state, cmat); } + // cublasSgemv, cublasDgemv have a bug where (x,0).mv(0) does not + // handle beta, whereas cublasSgemm, cublasDgemm do for case where (x,0).mm(0,y). + if (vec->size(0) == 0 && mat->size(0) != 0) { + if(THCNumerics::eq(beta, ScalarConvert::to(0))) { + THCTensor_(zero)(state, r_); + } else if(THCNumerics::ne(beta, ScalarConvert::to(1))) { + THCTensor_(mul)(state, r_, r_, beta); + } + } + #elif defined(THC_REAL_IS_HALF) // Currently no Hgemv/SgemvEx in Cublas THCTensor *vecAsMatrix = THCTensor_(newWithTensor)(state, vec); - THCTensor_(resize2d)(state, vecAsMatrix, vecAsMatrix->size[0], 1); + THCTensor_(resize2d)(state, vecAsMatrix, vecAsMatrix->size(0), 1); THCTensor *tAsMatrix = THCTensor_(newWithTensor)(state, t); - THCTensor_(resize2d)(state, tAsMatrix, tAsMatrix->size[0], 1); + THCTensor_(resize2d)(state, tAsMatrix, tAsMatrix->size(0), 1); THCTensor_(addmm)(state, r_, beta, tAsMatrix, alpha, mat, vecAsMatrix); // r_ will have answer as matrix, need to return a vector - THCTensor_(resize1d)(state, r_, r_->size[0]); + THCTensor_(resize1d)(state, r_, r_->size(0)); THCTensor_(free)(state, vecAsMatrix); THCTensor_(free)(state, tAsMatrix); #endif @@ -140,15 +150,15 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, vec1, vec2)); - if ( (vec1->_dim() != 1) || (vec2->_dim() != 1) ) { + if ( (vec1->dim() != 1) || (vec2->dim() != 1) ) { THError("vector and vector expected"); } - if (t->_dim() != 2) { + if (t->dim() != 2) { THError("size mismatch"); } - if ( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) ) { + if ( (t->size(0) != vec1->size(0)) || (t->size(1) != vec2->size(0)) ) { THError("size mismatch"); } @@ -164,32 +174,32 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a THCTensor_(mul)(state, r_, r_, beta); } - if(r_->stride[0] == 1) + if(r_->stride(0) == 1) { #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sger(state, vec1->size[0], vec2->size[0], - alpha, THCTensor_(data)(state, vec1), vec1->stride[0], - THCTensor_(data)(state, vec2), vec2->stride[0], - THCTensor_(data)(state, r_), r_->stride[1]); + THCudaBlas_Sger(state, vec1->size(0), vec2->size(0), + alpha, THCTensor_(data)(state, vec1), vec1->stride(0), + THCTensor_(data)(state, vec2), vec2->stride(0), + THCTensor_(data)(state, r_), r_->stride(1)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dger(state, vec1->size[0], vec2->size[0], - alpha, THCTensor_(data)(state, vec1), vec1->stride[0], - THCTensor_(data)(state, vec2), vec2->stride[0], - THCTensor_(data)(state, r_), r_->stride[1]); + THCudaBlas_Dger(state, vec1->size(0), vec2->size(0), + alpha, THCTensor_(data)(state, vec1), vec1->stride(0), + THCTensor_(data)(state, vec2), vec2->stride(0), + THCTensor_(data)(state, r_), r_->stride(1)); #endif } - else if(r_->stride[1] == 1) + else if(r_->stride(1) == 1) { #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sger(state, vec2->size[0], vec1->size[0], - alpha, THCTensor_(data)(state, vec2), vec2->stride[0], - THCTensor_(data)(state, vec1), vec1->stride[0], - THCTensor_(data)(state, r_), r_->stride[0]); + THCudaBlas_Sger(state, vec2->size(0), vec1->size(0), + alpha, THCTensor_(data)(state, vec2), vec2->stride(0), + THCTensor_(data)(state, vec1), vec1->stride(0), + THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) - 
THCudaBlas_Dger(state, vec2->size[0], vec1->size[0], - alpha, THCTensor_(data)(state, vec2), vec2->stride[0], - THCTensor_(data)(state, vec1), vec1->stride[0], - THCTensor_(data)(state, r_), r_->stride[0]); + THCudaBlas_Dger(state, vec2->size(0), vec1->size(0), + alpha, THCTensor_(data)(state, vec2), vec2->stride(0), + THCTensor_(data)(state, vec1), vec1->stride(0), + THCTensor_(data)(state, r_), r_->stride(0)); #endif } else @@ -197,15 +207,15 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a THCTensor *cr = THCTensor_(newClone)(state, r_); #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sger(state, vec2->size[0], vec1->size[0], - alpha, THCTensor_(data)(state, vec2), vec2->stride[0], - THCTensor_(data)(state, vec1), vec1->stride[0], - THCTensor_(data)(state, cr), cr->stride[0]); + THCudaBlas_Sger(state, vec2->size(0), vec1->size(0), + alpha, THCTensor_(data)(state, vec2), vec2->stride(0), + THCTensor_(data)(state, vec1), vec1->stride(0), + THCTensor_(data)(state, cr), cr->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dger(state, vec2->size[0], vec1->size[0], - alpha, THCTensor_(data)(state, vec2), vec2->stride[0], - THCTensor_(data)(state, vec1), vec1->stride[0], - THCTensor_(data)(state, cr), cr->stride[0]); + THCudaBlas_Dger(state, vec2->size(0), vec1->size(0), + alpha, THCTensor_(data)(state, vec2), vec2->stride(0), + THCTensor_(data)(state, vec1), vec1->stride(0), + THCTensor_(data)(state, cr), cr->stride(0)); #endif THCTensor_(freeCopyTo)(state, cr, r_); @@ -213,11 +223,11 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a #elif defined(THC_REAL_IS_HALF) // currently no Hger/SgerEx in Cublas. THCTensor *vec2T = THCTensor_(newWithTensor)(state, vec2); - THCTensor_(resize2d)(state, vec2T, vec2T->size[0], 1); + THCTensor_(resize2d)(state, vec2T, vec2T->size(0), 1); THCTensor_(transpose)(state, vec2T, NULL, 0, 1); THCTensor *vec1M = THCTensor_(newWithTensor)(state, vec1); - THCTensor_(resize2d)(state, vec1M, vec1M->size[0], 1); + THCTensor_(resize2d)(state, vec1M, vec1M->size(0), 1); THCTensor_(addmm)(state, r_, beta, t, alpha, vec1M, vec2T); THCTensor_(free)(state, vec2T); @@ -237,19 +247,19 @@ THCTensor_(addmm)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real char transpose_r, transpose_m1, transpose_m2; THCTensor *r__, *m1_, *m2_; - if( (m1->_dim() != 2) || (m2->_dim() != 2) ) - THError("matrices expected, got %dD, %dD tensors", m1->_dim(), m2->_dim()); + if( (m1->dim() != 2) || (m2->dim() != 2) ) + THError("matrices expected, got %dD, %dD tensors", m1->dim(), m2->dim()); - if(t->_dim() != 2) - THError("matrix expected, got %dD tensor for t", t->_dim()); + if(t->dim() != 2) + THError("matrix expected, got %dD tensor for t", t->dim()); - if(m1->size[1] != m2->size[0]) { + if(m1->size(1) != m2->size(0)) { THCDescBuff bm1 = THCTensor_(sizeDesc)(state, m1); THCDescBuff bm2 = THCTensor_(sizeDesc)(state, m2); THError("size mismatch, m1: %s, m2: %s", bm1.str, bm2.str); } - if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) ) { + if( (t->size(0) != m1->size(0)) || (t->size(1) != m2->size(1)) ) { THCDescBuff bt = THCTensor_(sizeDesc)(state, t); THCDescBuff bm1 = THCTensor_(sizeDesc)(state, m1); THCDescBuff bm2 = THCTensor_(sizeDesc)(state, m2); @@ -265,14 +275,14 @@ THCTensor_(addmm)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real } /* r_ */ - if(r_->stride[0] == 1 && - r_->stride[1] != 0) + if(r_->stride(0) == 1 && + r_->stride(1) != 0) { transpose_r = 'n'; r__ = r_; } - else 
if(r_->stride[1] == 1 && - r_->stride[0] != 0) + else if(r_->stride(1) == 1 && + r_->stride(0) != 0) { THCTensor *swap = m2; m2 = m1; @@ -291,14 +301,14 @@ THCTensor_(addmm)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real } /* m1 */ - if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && - m1->stride[(transpose_r == 'n' ? 1 : 0)] != 0) + if(m1->stride((transpose_r == 'n' ? 0 : 1)) == 1 && + m1->stride((transpose_r == 'n' ? 1 : 0)) != 0) { transpose_m1 = 'n'; m1_ = m1; } - else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && - m1->stride[(transpose_r == 'n' ? 0 : 1)] != 0) + else if(m1->stride((transpose_r == 'n' ? 1 : 0)) == 1 && + m1->stride((transpose_r == 'n' ? 0 : 1)) != 0) { transpose_m1 = 't'; m1_ = m1; @@ -310,14 +320,14 @@ THCTensor_(addmm)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real } /* m2 */ - if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && - m2->stride[(transpose_r == 'n' ? 1 : 0)] != 0) + if(m2->stride((transpose_r == 'n' ? 0 : 1)) == 1 && + m2->stride((transpose_r == 'n' ? 1 : 0)) != 0) { transpose_m2 = 'n'; m2_ = m2; } - else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && - m2->stride[(transpose_r == 'n' ? 0 : 1)] != 0) + else if(m2->stride((transpose_r == 'n' ? 1 : 0)) == 1 && + m2->stride((transpose_r == 'n' ? 0 : 1)) != 0) { transpose_m2 = 't'; m2_ = m2; @@ -332,47 +342,47 @@ THCTensor_(addmm)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real THCudaBlas_Hgemm(state, transpose_m1, transpose_m2, - r__->size[(transpose_r == 'n' ? 0 : 1)], - r__->size[(transpose_r == 'n' ? 1 : 0)], - m1_->size[(transpose_r == 'n' ? 1 : 0)], + r__->size((transpose_r == 'n' ? 0 : 1)), + r__->size((transpose_r == 'n' ? 1 : 0)), + m1_->size((transpose_r == 'n' ? 1 : 0)), alpha, THCTensor_(data)(state, m1_), - (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]), + (transpose_m1 == 'n' ? m1_->stride((transpose_r == 'n' ? 1 : 0)) : m1_->stride((transpose_r == 'n' ? 0 : 1))), THCTensor_(data)(state, m2_), - (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]), + (transpose_m2 == 'n' ? m2_->stride((transpose_r == 'n' ? 1 : 0)) : m2_->stride((transpose_r == 'n' ? 0 : 1))), beta, THCTensor_(data)(state, r__), - r__->stride[(transpose_r == 'n' ? 1 : 0)]); + r__->stride((transpose_r == 'n' ? 1 : 0))); #elif defined(THC_REAL_IS_FLOAT) THCudaBlas_Sgemm(state, transpose_m1, transpose_m2, - r__->size[(transpose_r == 'n' ? 0 : 1)], - r__->size[(transpose_r == 'n' ? 1 : 0)], - m1_->size[(transpose_r == 'n' ? 1 : 0)], + r__->size((transpose_r == 'n' ? 0 : 1)), + r__->size((transpose_r == 'n' ? 1 : 0)), + m1_->size((transpose_r == 'n' ? 1 : 0)), alpha, THCTensor_(data)(state, m1_), - (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]), + (transpose_m1 == 'n' ? m1_->stride((transpose_r == 'n' ? 1 : 0)) : m1_->stride((transpose_r == 'n' ? 0 : 1))), THCTensor_(data)(state, m2_), - (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]), + (transpose_m2 == 'n' ? m2_->stride((transpose_r == 'n' ? 1 : 0)) : m2_->stride((transpose_r == 'n' ? 0 : 1))), beta, THCTensor_(data)(state, r__), - r__->stride[(transpose_r == 'n' ? 1 : 0)]); + r__->stride((transpose_r == 'n' ? 1 : 0))); #elif defined(THC_REAL_IS_DOUBLE) THCudaBlas_Dgemm(state, transpose_m1, transpose_m2, - r__->size[(transpose_r == 'n' ? 0 : 1)], - r__->size[(transpose_r == 'n' ? 
1 : 0)], - m1_->size[(transpose_r == 'n' ? 1 : 0)], + r__->size((transpose_r == 'n' ? 0 : 1)), + r__->size((transpose_r == 'n' ? 1 : 0)), + m1_->size((transpose_r == 'n' ? 1 : 0)), alpha, THCTensor_(data)(state, m1_), - (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]), + (transpose_m1 == 'n' ? m1_->stride((transpose_r == 'n' ? 1 : 0)) : m1_->stride((transpose_r == 'n' ? 0 : 1))), THCTensor_(data)(state, m2_), - (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]), + (transpose_m2 == 'n' ? m2_->stride((transpose_r == 'n' ? 1 : 0)) : m2_->stride((transpose_r == 'n' ? 0 : 1))), beta, THCTensor_(data)(state, r__), - r__->stride[(transpose_r == 'n' ? 1 : 0)]); + r__->stride((transpose_r == 'n' ? 1 : 0))); #endif /* free intermediate variables */ @@ -397,9 +407,9 @@ THCTensor_(addbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, real alpha, THCTensor *batch1, THCTensor *batch2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2)); - THArgCheck(THCTensor_(_nDimension)(state, t) == 2, 4, "expected 2D tensor"); - THArgCheck(THCTensor_(_nDimension)(state, batch1) == 3, 6, "expected 3D tensor"); - THArgCheck(THCTensor_(_nDimension)(state, batch2) == 3, 7, "expected 3D tensor"); + THArgCheck(THCTensor_(nDimension)(state, t) == 2, 4, "expected 2D tensor"); + THArgCheck(THCTensor_(nDimension)(state, batch1) == 3, 6, "expected 3D tensor"); + THArgCheck(THCTensor_(nDimension)(state, batch2) == 3, 7, "expected 3D tensor"); int64_t batchnum = THCTensor_(size)(state, batch1, 0); int64_t m1d1 = THCTensor_(size)(state, batch1, 1); @@ -462,9 +472,9 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, real alpha, THCTensor *batch1, THCTensor *batch2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2)); - THArgCheck(THCTensor_(_nDimension)(state, t) == 3, 4, "expected 3D tensor"); - THArgCheck(THCTensor_(_nDimension)(state, batch1) == 3, 6, "expected 3D tensor"); - THArgCheck(THCTensor_(_nDimension)(state, batch2) == 3, 7, "expected 3D tensor"); + THArgCheck(THCTensor_(nDimension)(state, t) == 3, 4, "expected 3D tensor"); + THArgCheck(THCTensor_(nDimension)(state, batch1) == 3, 6, "expected 3D tensor"); + THArgCheck(THCTensor_(nDimension)(state, batch2) == 3, 7, "expected 3D tensor"); THArgCheck(THCTensor_(size)(state, t, 0) == THCTensor_(size)(state, batch1, 0), 6, "equal number of batches expected"); THArgCheck(THCTensor_(size)(state, t, 0) == THCTensor_(size)(state, batch2, 0), 7, @@ -487,13 +497,13 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, char transpose_batch1, transpose_batch2; int64_t lda, ldb, ldc; THCTensor *result_, *batch1_, *batch2_; - if (result->stride[1] == 1) + if (result->stride(1) == 1) { transpose_result = false; result_ = result; - ldc = result_->stride[2]; + ldc = result_->stride(2); } - else if (result->stride[2] == 1) + else if (result->stride(2) == 1) { transpose_result = true; @@ -502,7 +512,7 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, batch1 = swap; result_ = result; - ldc = result_->stride[1]; + ldc = result_->stride(1); } else { @@ -513,22 +523,22 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real 
beta, THCTensor *t, THCTensor_(free)(state, transp_r_); THCTensor_(transpose)(state, result_, NULL, 1, 2); - ldc = result_->stride[2]; + ldc = result_->stride(2); } - if (batch1->stride[transpose_result ? 2 : 1] == 1 && - batch1->stride[transpose_result ? 1 : 2] != 0) + if (batch1->stride(transpose_result ? 2 : 1) == 1 && + batch1->stride(transpose_result ? 1 : 2) != 0) { transpose_batch1 = 'n'; batch1_ = batch1; - lda = batch1_->stride[transpose_result ? 1 : 2]; + lda = batch1_->stride(transpose_result ? 1 : 2); } - else if (batch1->stride[transpose_result ? 1 : 2] == 1 && - batch1->stride[transpose_result ? 2 : 1] != 0) + else if (batch1->stride(transpose_result ? 1 : 2) == 1 && + batch1->stride(transpose_result ? 2 : 1) != 0) { transpose_batch1 = 't'; batch1_ = batch1; - lda = batch1_->stride[transpose_result ? 2 : 1]; + lda = batch1_->stride(transpose_result ? 2 : 1); } else { @@ -539,22 +549,22 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, } else { batch1_ = THCTensor_(newContiguous)(state, batch1); } - lda = batch1_->stride[1]; + lda = batch1_->stride(1); } - if (batch2->stride[transpose_result ? 2 : 1] == 1 && - batch2->stride[transpose_result ? 1 : 2] != 0) + if (batch2->stride(transpose_result ? 2 : 1) == 1 && + batch2->stride(transpose_result ? 1 : 2) != 0) { transpose_batch2 = 'n'; batch2_ = batch2; - ldb = batch2_->stride[transpose_result ? 1 : 2]; + ldb = batch2_->stride(transpose_result ? 1 : 2); } - else if (batch2->stride[transpose_result ? 1 : 2] == 1 && - batch2->stride[transpose_result ? 2 : 1] != 0) + else if (batch2->stride(transpose_result ? 1 : 2) == 1 && + batch2->stride(transpose_result ? 2 : 1) != 0) { transpose_batch2 = 't'; batch2_ = batch2; - ldb = batch2_->stride[transpose_result ? 2 : 1]; + ldb = batch2_->stride(transpose_result ? 2 : 1); } else { @@ -565,9 +575,9 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, } else { batch2_ = THCTensor_(newContiguous)(state, batch2); } - ldb = batch2_->stride[1]; + ldb = batch2_->stride(1); } - int64_t num_batches = result_->size[0]; + int64_t num_batches = result_->size(0); #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) // Compute pointers to matrices in each batch. @@ -585,16 +595,16 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, createBatchGemmBuffer3<<>>( d_matrices1, d_matrices2, (const real**)d_result_matrices, THCTensor_(data)(state, batch1_), THCTensor_(data)(state, batch2_), THCTensor_(data)(state, result_), - batch1_->stride[0], batch2_->stride[0], result_->stride[0], num_batches); + batch1_->stride(0), batch2_->stride(0), result_->stride(0), num_batches); #ifdef THC_REAL_IS_FLOAT THCudaBlas_SgemmBatched( state, transpose_batch1, transpose_batch2, - result_->size[transpose_result ? 2 : 1], - result_->size[transpose_result ? 1 : 2], - batch1_->size[transpose_result ? 1 : 2], + result_->size(transpose_result ? 2 : 1), + result_->size(transpose_result ? 1 : 2), + batch1_->size(transpose_result ? 1 : 2), alpha, d_matrices1, lda, d_matrices2, ldb, @@ -606,9 +616,9 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, state, transpose_batch1, transpose_batch2, - result_->size[transpose_result ? 2 : 1], - result_->size[transpose_result ? 1 : 2], - batch1_->size[transpose_result ? 1 : 2], + result_->size(transpose_result ? 2 : 1), + result_->size(transpose_result ? 1 : 2), + batch1_->size(transpose_result ? 
1 : 2), alpha, d_matrices1, lda, d_matrices2, ldb, @@ -627,28 +637,28 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, state, transpose_batch1, transpose_batch2, - result_->size[transpose_result ? 2 : 1], - result_->size[transpose_result ? 1 : 2], - batch1_->size[transpose_result ? 1 : 2], + result_->size(transpose_result ? 2 : 1), + result_->size(transpose_result ? 1 : 2), + batch1_->size(transpose_result ? 1 : 2), alpha, - THCTensor_(data)(state, batch1_), lda, batch1_->stride[0], - THCTensor_(data)(state, batch2_), ldb, batch2_->stride[0], + THCTensor_(data)(state, batch1_), lda, batch1_->stride(0), + THCTensor_(data)(state, batch2_), ldb, batch2_->stride(0), beta, - THCTensor_(data)(state, result_), ldc, result_->stride[0], + THCTensor_(data)(state, result_), ldc, result_->stride(0), num_batches); #elif defined(THC_REAL_IS_DOUBLE) THCudaBlas_DgemmStridedBatched( state, transpose_batch1, transpose_batch2, - result_->size[transpose_result ? 2 : 1], - result_->size[transpose_result ? 1 : 2], - batch1_->size[transpose_result ? 1 : 2], + result_->size(transpose_result ? 2 : 1), + result_->size(transpose_result ? 1 : 2), + batch1_->size(transpose_result ? 1 : 2), alpha, - THCTensor_(data)(state, batch1_), lda, batch1_->stride[0], - THCTensor_(data)(state, batch2_), ldb, batch2_->stride[0], + THCTensor_(data)(state, batch1_), lda, batch1_->stride(0), + THCTensor_(data)(state, batch2_), ldb, batch2_->stride(0), beta, - THCTensor_(data)(state, result_), ldc, result_->stride[0], + THCTensor_(data)(state, result_), ldc, result_->stride(0), num_batches); #endif //THC_REAL #endif //CUDA_VERSION @@ -662,14 +672,14 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, state, transpose_batch1, transpose_batch2, - result_->size[transpose_result ? 2 : 1], - result_->size[transpose_result ? 1 : 2], - batch1_->size[transpose_result ? 1 : 2], + result_->size(transpose_result ? 2 : 1), + result_->size(transpose_result ? 1 : 2), + batch1_->size(transpose_result ? 1 : 2), alpha, - THCTensor_(data)(state, batch1_) + i * batch1_->stride[0], lda, - THCTensor_(data)(state, batch2_) + i * batch2_->stride[0], ldb, + THCTensor_(data)(state, batch1_) + i * batch1_->stride(0), lda, + THCTensor_(data)(state, batch2_) + i * batch2_->stride(0), ldb, beta, - THCTensor_(data)(state, result_) + i * result_->stride[0], ldc); + THCTensor_(data)(state, result_) + i * result_->stride(0), ldc); } #else cudaDeviceProp* prop = THCState_getCurrentDeviceProperties(state); @@ -679,14 +689,14 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, state, transpose_batch1, transpose_batch2, - result_->size[transpose_result ? 2 : 1], - result_->size[transpose_result ? 1 : 2], - batch1_->size[transpose_result ? 1 : 2], + result_->size(transpose_result ? 2 : 1), + result_->size(transpose_result ? 1 : 2), + batch1_->size(transpose_result ? 
1 : 2), alpha, - THCTensor_(data)(state, batch1_), lda, batch1_->stride[0], - THCTensor_(data)(state, batch2_), ldb, batch2_->stride[0], + THCTensor_(data)(state, batch1_), lda, batch1_->stride(0), + THCTensor_(data)(state, batch2_), ldb, batch2_->stride(0), beta, - THCTensor_(data)(state, result_), ldc, result_->stride[0], + THCTensor_(data)(state, result_), ldc, result_->stride(0), num_batches); } else { for (int64_t i = 0; i < num_batches; ++i) { @@ -694,14 +704,14 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, state, transpose_batch1, transpose_batch2, - result_->size[transpose_result ? 2 : 1], - result_->size[transpose_result ? 1 : 2], - batch1_->size[transpose_result ? 1 : 2], + result_->size(transpose_result ? 2 : 1), + result_->size(transpose_result ? 1 : 2), + batch1_->size(transpose_result ? 1 : 2), alpha, - THCTensor_(data)(state, batch1_) + i * batch1_->stride[0], lda, - THCTensor_(data)(state, batch2_) + i * batch2_->stride[0], ldb, + THCTensor_(data)(state, batch1_) + i * batch1_->stride(0), lda, + THCTensor_(data)(state, batch2_) + i * batch2_->stride(0), ldb, beta, - THCTensor_(data)(state, result_) + i * result_->stride[0], ldc); + THCTensor_(data)(state, result_) + i * result_->stride(0), ldc); } } @@ -728,29 +738,26 @@ THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTens { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THAssert(THCTensor_(checkGPU)(state, 2, ra_, a)); - THArgCheck(THCTensor_(_nDimension)(state, a) == 3, 3, "expected 3D tensor"); + THArgCheck(THCTensor_(nDimension)(state, a) == 3, 3, "expected 3D tensor"); THArgCheck(THCTensor_(size)(state, a, 1) == THCTensor_(size)(state, a, 2), 3, "matrices must be square"); if (ra_ != a) { THCTensor_(resizeAs)(state, ra_, a); - // not sure if this is kosher, but things are nicer if we return in column major - if (ra_->stride[0] == 1) { - THCTensor_(transpose)(state, ra_, NULL, 1, 0); - } else if (ra_->stride[2] == 1) { + if (ra_->stride(2) == 1) { THCTensor_(transpose)(state, ra_, NULL, 1, 2); } THCTensor_(copy)(state, ra_, a); } - int n = a->size[1]; + int n = a->size(1); int lda; THCTensor *ra__; - if (ra_->stride[1] == 1) { + if (ra_->stride(1) == 1) { // column ordered, what BLAS wants - lda = ra_->stride[2]; + lda = ra_->stride(2); ra__ = ra_; } else { // not column ordered, need to make it such (requires copy) @@ -758,10 +765,10 @@ THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTens ra__ = THCTensor_(newClone)(state, transp_r_); THCTensor_(free)(state, transp_r_); THCTensor_(transpose)(state, ra__, NULL, 1, 2); - lda = ra__->stride[2]; + lda = ra__->stride(2); } - int64_t num_batches = ra__->size[0]; + int64_t num_batches = ra__->size(0); if (!pivot) { THCudaIntTensor *t = THCudaIntTensor_new(state); @@ -787,11 +794,13 @@ THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTens size_t matrices_size = num_batches * sizeof(real*); auto d_result = static_cast(THCudaMalloc(state, matrices_size)); - const int64_t block = 512; - const int64_t grid = (num_batches + block - 1) / block; - createBatchGemmBuffer<<>>( - (const real**)d_result, THCTensor_(data)(state, ra__), - ra__->stride[0], num_batches); + if (num_batches > 0) { + const int64_t block = 512; + const int64_t grid = (num_batches + block - 1) / block; + createBatchGemmBuffer<<>>( + (const real**)d_result, THCTensor_(data)(state, ra__), + ra__->stride(0), num_batches); + } int *pivots_gpu = NULL; if (pivot) { @@ -810,12 +819,16 @@ 
THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTens } if (free_rinfo_) { - int min = THCudaIntTensor_minall(state, rinfo_); - int max = THCudaIntTensor_maxall(state, rinfo_); - THCudaIntTensor_free(state, rinfo_); - if (min != 0 || max != 0) { - THError("failed to factorize some batch elements (min info == %d, max info == %d)", - min, max); + if(THCTensor_nElement(state, rinfo_) != 0) { + int min = THCudaIntTensor_minall(state, rinfo_); + int max = THCudaIntTensor_maxall(state, rinfo_); + THCudaIntTensor_free(state, rinfo_); + if (min != 0 || max != 0) { + THError("failed to factorize some batch elements (min info == %d, max info == %d)", + min, max); + } + } else { + THCudaIntTensor_free(state, rinfo_); } } @@ -846,16 +859,16 @@ THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b } - int n = atf->size[1]; - int nrhs = rb_->_dim() > 2 ? rb_->size[2] : 1; + int n = atf->size(1); + int nrhs = rb_->_dim() > 2 ? rb_->size(2) : 1; THCTensor *atf_; THCTensor *rb__; int lda, ldb; // correct ordering of A_tf - if (atf->stride[1] == 1) { + if (atf->stride(1) == 1) { // column ordered, what BLAS wants - lda = atf->stride[2]; + lda = atf->stride(2); atf_ = atf; } else { // not column ordered, need to make it such (requires copy) @@ -866,16 +879,16 @@ THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b atf_ = THCTensor_(newClone)(state, transp_r_); THCTensor_(free)(state, transp_r_); THCTensor_(transpose)(state, atf_, NULL, 1, 2); - lda = atf_->stride[2]; + lda = atf_->stride(2); } // correct ordering of B - if (rb_->stride[1] == 1) { + if (rb_->stride(1) == 1) { // column ordered - if (rb_->_dim() == 2 || rb_->size[2] == 1) { + if (rb_->_dim() == 2 || rb_->size(2) == 1) { ldb = n; } else { - ldb = rb_->stride[2]; + ldb = rb_->stride(2); } rb__ = rb_; } else { @@ -885,14 +898,14 @@ THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b rb__ = THCTensor_(newClone)(state, transp_r_); THCTensor_(free)(state, transp_r_); THCTensor_(transpose)(state, rb__, NULL, 1, 2); - ldb = rb__->stride[2]; + ldb = rb__->stride(2); } else { rb__ = THCTensor_(newClone)(state, rb_); ldb = n; } } - int64_t num_batches = rb_->size[0]; + int64_t num_batches = rb_->size(0); size_t matrices_size = num_batches * sizeof(real*); // Copy pointers to device. 
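The btrifact hunk above adds two empty-batch guards: the createBatchGemmBuffer launch now runs only when num_batches > 0, and the factorization info tensor is reduced with minall/maxall only when it has elements, since a grid of zero blocks is not a valid launch configuration and a min/max reduction over an empty tensor has nothing to reduce. A host-side sketch of the same guards, with hypothetical helper names rather than the THC API:

    #include <algorithm>
    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    // Mirrors the new rinfo_ handling: only look for nonzero LAPACK-style info
    // codes when there is at least one batch element to inspect.
    void checkBatchFactorizeInfo(const std::vector<int>& info) {
      if (info.empty()) return;
      int lo = *std::min_element(info.begin(), info.end());
      int hi = *std::max_element(info.begin(), info.end());
      if (lo != 0 || hi != 0)
        throw std::runtime_error("failed to factorize some batch elements");
    }

    // Mirrors the num_batches > 0 guard: with zero batches there is nothing to
    // set up, and (num_batches + block - 1) / block would yield a zero-block grid.
    int64_t bufferGridSize(int64_t num_batches, int64_t block = 512) {
      return num_batches > 0 ? (num_batches + block - 1) / block : 0;
    }
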
@@ -903,10 +916,10 @@ THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b const int64_t grid = (num_batches + block - 1) / block; createBatchGemmBuffer<<>>( (const real**)d_result, THCTensor_(data)(state, rb__), - rb__->stride[0], num_batches); + rb__->stride(0), num_batches); createBatchGemmBuffer<<>>( d_atf, THCTensor_(data)(state, atf_), - atf_->stride[0], num_batches); + atf_->stride(0), num_batches); if (!THCudaIntTensor_isContiguous(state, pivots)) { THError("Error: pivots is not contiguous."); diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu b/aten/src/THC/generic/THCTensorMathMagma.cu index fa7220729f0cf6..0de79233c122b4 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.cu +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -40,7 +40,7 @@ static void THCTensor_(copyTensor2d)(THCState *state, real *dst, THCTensor *self static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, THCTensor *src) { THAssert(src->_dim() == 2); - if (self == src && self->stride[0] == 1 && self->stride[1] == self->size[0]) + if (self == src && self->stride(0) == 1 && self->stride(1) == self->size(0)) { THCTensor_(retain)(state, self); return self; @@ -51,8 +51,8 @@ static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, T else THCTensor_(retain)(state, self); - int64_t size[2] = { src->size[0], src->size[1] }; - int64_t stride[2] = { 1, src->size[0] }; + int64_t size[2] = { src->size(0), src->size(1) }; + int64_t stride[2] = { 1, src->size(0) }; THCTensor_(resizeNd)(state, self, 2, size, stride); THCTensor_(copy)(state, self, src); @@ -65,11 +65,11 @@ THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, T #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); THArgCheck(!b_->is_empty() && b_->dim() == 2, 2, "b should be (non-empty) 2 dimensional"); - THArgCheck(a_->size[0] == a_->size[1], 1, "A should be square"); - THArgCheck(b_->size[0] == a_->size[0], 2, "A,b size incompatible"); + THArgCheck(a_->size(0) == a_->size(1), 1, "A should be square"); + THArgCheck(b_->size(0) == a_->size(0), 2, "A,b size incompatible"); - int64_t n = a_->size[0]; - int64_t nrhs = b_->size[1]; + int64_t n = a_->size(0); + int64_t nrhs = b_->size(1); THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_); THCTensor *b = THCTensor_(newColumnMajor)(state, rb_, b_); @@ -104,8 +104,8 @@ THC_API void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); THArgCheck(!b_->is_empty() && b_->dim() == 2, 2, "b should be (non-empty) 2 dimensional"); - THArgCheck(a_->size[0] == a_->size[1], 1, "A should be square"); - THArgCheck(b_->size[0] == a_->size[0], 2, "A,b size incompatible"); + THArgCheck(a_->size(0) == a_->size(1), 1, "A should be square"); + THArgCheck(b_->size(0) == a_->size(0), 2, "A,b size incompatible"); magma_side_t sz = MagmaLeft; magma_uplo_t ul = uplo[0] == 'U' ? 
MagmaUpper : MagmaLower; @@ -114,8 +114,8 @@ THC_API void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, real alpha = 1; - int64_t n = a_->size[0]; - int64_t nrhs = b_->size[1]; + int64_t n = a_->size(0); + int64_t nrhs = b_->size(1); THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_); THCTensor *b = THCTensor_(newColumnMajor)(state, rb_, b_); @@ -140,9 +140,9 @@ THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, T #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); THArgCheck(!b_->is_empty() && b_->dim() == 2, 1, "b should be (non-empty) 2 dimensional"); - THArgCheck(a_->size[0] == b_->size[0], 2, "Expected A and b to have same size " + THArgCheck(a_->size(0) == b_->size(0), 2, "Expected A and b to have same size " "at dim 0, but they have incompatible sizes"); - THArgCheck(a_->size[0] >= a_->size[1], 2, "Expected A with shape (m x n) to have " + THArgCheck(a_->size(0) >= a_->size(1), 2, "Expected A with shape (m x n) to have " "m >= n. The case for m < n is not implemented yet."); THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_); @@ -150,9 +150,9 @@ THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, T real *a_data = THCTensor_(data)(state, a); real *b_data = THCTensor_(data)(state, b); - int64_t m = a->size[0]; - int64_t n = a->size[1]; - int64_t nrhs = b->size[1]; + int64_t m = a->size(0); + int64_t n = a->size(1); + int64_t nrhs = b->size(1); real wkopt; int info; @@ -185,7 +185,7 @@ THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, T THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a, const char *jobzs, const char *uplos) { #ifdef USE_MAGMA - int64_t n = a->size[0]; + int64_t n = a->size(0); int64_t lda = n; magma_uplo_t uplo = uplos[0] == 'U' ? MagmaUpper : MagmaLower; @@ -244,10 +244,10 @@ THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, T { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 3, "A should be (non-empty) 2 dimensional"); - THArgCheck(a_->size[0] == a_->size[1], 3, "A should be square"); + THArgCheck(a_->size(0) == a_->size(1), 3, "A should be square"); magma_vec_t jobvr = jobvrs[0] == 'N' ? MagmaNoVec : MagmaVec; - int64_t n = a_->size[0]; + int64_t n = a_->size(0); real *a_data = th_magma_malloc_pinned(n * n); THCTensor_(copyTensor2d)(state, a_data, a_); @@ -328,8 +328,8 @@ THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, magma_vec_t jobz = jobus[0] == 'A' ? MagmaAllVec : jobus[0] == 'S' ? MagmaSomeVec : jobus[0] == 'O' ? MagmaOverwriteVec : MagmaNoVec; int iunused[1]; - int64_t m = a->size[0]; - int64_t n = a->size[1]; + int64_t m = a->size(0); + int64_t n = a->size(1); int64_t k = m < n ? m : n; int64_t j = (jobz == MagmaAllVec) ? m : k; int64_t jv = (jobz == MagmaAllVec) ? 
n : k; @@ -387,11 +387,11 @@ THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a) { THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 2, "A should be square"); #ifdef USE_MAGMA int info; - int64_t n = a->size[0]; + int64_t n = a->size(0); int lwork = n * magma_get_sgetri_nb(n); THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a); @@ -430,11 +430,11 @@ THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a) magma_free_pinned(ipiv); THCTensor_(freeCopyTo)(state, input, ra_); #else - int64_t n = a->size[0]; + int64_t n = a->size(0); // input THCTensor *input = THCTensor_(newColumnMajor)(state, a, a); - THCTensor_(resizeNd)(state, ra_, 2, input->size, input->stride); + THCTensor_(resizeNd)(state, ra_, 2, THTensor_getSizePtr(input), THTensor_getStridePtr(input)); real *matrices1[1] = { THCTensor_(data)(state, input) }; real *matrices2[1] = { THCTensor_(data)(state, ra_) }; @@ -516,9 +516,9 @@ THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, co { #ifdef USE_MAGMA THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 2, "A should be square"); - int64_t n = a->size[0]; + int64_t n = a->size(0); magma_uplo_t ul = uplo[0] == 'U' ? MagmaUpper : MagmaLower; THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a); @@ -556,9 +556,9 @@ THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, co { #ifdef USE_MAGMA THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be (non-empty) 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 2, "A should be square"); - int64_t n = a->size[0]; + int64_t n = a->size(0); magma_uplo_t ul = uplo[0] == 'U' ? MagmaUpper : MagmaLower; THCTensor *input = THCTensor_(newColumnMajor)(state, ra_, a); @@ -591,10 +591,10 @@ THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, co THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *a, const char *uplo) { #ifdef USE_MAGMA - THArgCheck(a->size[0] == a->size[1], 2, "A should be square"); + THArgCheck(a->size(0) == a->size(1), 2, "A should be square"); - int64_t n = a->size[0]; - int64_t nrhs = b->size[1]; + int64_t n = a->size(0); + int64_t nrhs = b->size(1); magma_uplo_t ul = uplo[0] == 'U' ? MagmaUpper : MagmaLower; THCTensor *b_ = THCTensor_(newColumnMajor)(state, rb_, b); @@ -626,8 +626,8 @@ THC_API void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_ THArgCheck(!a_->is_empty() && a_->dim() == 2, 2, "A should be non-empty 2 dimensional"); THCTensor *a = THCTensor_(newColumnMajor)(state, ra_, a_); - int64_t m = a->size[0]; - int64_t n = a->size[1]; + int64_t m = a->size(0); + int64_t n = a->size(1); int64_t k = (m < n ? 
m : n); #if defined(THC_REAL_IS_FLOAT) @@ -663,8 +663,8 @@ THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THC THArgCheck(!a_->is_empty() && a_->dim() == 2, 2, "A should be non-empty 2 dimensional"); THCTensor *a = THCTensor_(newColumnMajor)(state, rr_, a_); - int64_t m = a->size[0]; - int64_t n = a->size[1]; + int64_t m = a->size(0); + int64_t n = a->size(1); int64_t k = (m < n ? m : n); #if defined(THC_REAL_IS_FLOAT) diff --git a/aten/src/THC/generic/THCTensorMathPairwise.cu b/aten/src/THC/generic/THCTensorMathPairwise.cu index e0f1219dcf8433..62c57a04380c43 100644 --- a/aten/src/THC/generic/THCTensorMathPairwise.cu +++ b/aten/src/THC/generic/THCTensorMathPairwise.cu @@ -196,8 +196,8 @@ void THCTensor_(tril)(THCState *state, THCTensor *self_, THCTensor *src_, int64_ if (self_ != src_) THCTensor_(resizeAs)(state, self_, src_); - int64_t stride0 = self_->stride[0]; - int64_t stride1 = self_->stride[1]; + int64_t stride0 = self_->stride(0); + int64_t stride1 = self_->stride(1); real *start = THCTensor_(data)(state, self_); TensorTriOp op(start, stride0, stride1, k); @@ -225,8 +225,8 @@ void THCTensor_(triu)(THCState *state, THCTensor *self_, THCTensor *src_, int64_ if (self_ != src_) THCTensor_(resizeAs)(state, self_, src_); - int64_t stride0 = self_->stride[0]; - int64_t stride1 = self_->stride[1]; + int64_t stride0 = self_->stride(0); + int64_t stride1 = self_->stride(1); real *start = THCTensor_(data)(state, self_); TensorTriOp op(start, stride0, stride1, k); diff --git a/aten/src/THC/generic/THCTensorMathPointwise.cu b/aten/src/THC/generic/THCTensorMathPointwise.cu index 50ca326dba376d..7fb6fda38b38a5 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.cu +++ b/aten/src/THC/generic/THCTensorMathPointwise.cu @@ -114,9 +114,9 @@ THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y, THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, x, y)); int i; - int nd = THCTensor_(_nDimension)(state, x); + int nd = THCTensor_(nDimension)(state, x); ptrdiff_t nelem = THCTensor_(nElement)(state, x); - THArgCheck(nd == THCTensor_(_nDimension)(state, y), 1, "tensors must have same number of dimensions"); + THArgCheck(nd == THCTensor_(nDimension)(state, y), 1, "tensors must have same number of dimensions"); for (i = 0; i < nd; i++) { THArgCheck(THCTensor_(size)(state, x, i) == THCTensor_(size)(state, y, i), 1, "dimension %i of x and y does not match", i); if (dimension < 0 && THCTensor_(size)(state, x, i) == 3) { diff --git a/aten/src/THC/generic/THCTensorMathReduce.cu b/aten/src/THC/generic/THCTensorMathReduce.cu index e5d8e22e5bb5eb..1c9d9eac6ac603 100644 --- a/aten/src/THC/generic/THCTensorMathReduce.cu +++ b/aten/src/THC/generic/THCTensorMathReduce.cu @@ -61,13 +61,13 @@ THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, real value, THCTensor *self_; THCTensor *src_ = THCTensor_(newTranspose)(state, src, dimension, 0); THCTensor *data = THCTensor_(newClone)(state, src_); - ptrdiff_t size = THCTensor_(nElement)(state, data)/data->size[0]; + ptrdiff_t size = THCTensor_(nElement)(state, data)/data->size(0); THArgCheck(dimension >= 0 && dimension < THCTensor_(_nDimension)(state, src), 3, "invalid dimension"); THArgCheck(THCNumerics::gt(value, scalar_cast(0)), 2, "non-positive-norm not supported"); THArgCheck(THCTensor_(_nDimension)(state, src) > 1, 1, "need at least 2 dimensions"); - dim3 grid(data->size[0]); + dim3 grid(data->size(0)); dim3 threads(32); THCTensor_kernel_renorm diff --git 
a/aten/src/THC/generic/THCTensorMathScan.cu b/aten/src/THC/generic/THCTensorMathScan.cu index 63657d4aa6027a..5aafb3bae8a6d9 100644 --- a/aten/src/THC/generic/THCTensorMathScan.cu +++ b/aten/src/THC/generic/THCTensorMathScan.cu @@ -79,7 +79,7 @@ void THCTensor_(scanDim)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, real init, BinaryFunction binary_op) { // "init" must be the identity element for binary_op - int ndim = THCTensor_(_nDimension)(state, src); + int ndim = THCTensor_(nDimension)(state, src); THArgCheck(dimension >= 0 && dimension < ndim, 3, "dimension %d out of range", dimension + TH_INDEX_BASE); @@ -87,16 +87,18 @@ void THCTensor_(scanDim)(THCState *state, THCTensor *self_, THCTensor *src, THCTensor *self = THCTensor_(newContiguous)(state, self_); src = THCTensor_(newContiguous)(state, src); -#ifndef THC_REAL_IS_HALF - if (ndim == 1) { - // thrust does not take an "init" - THCTensor_(scanThrust)(state, self, src, binary_op); - } else -#endif - if (dimension == ndim - 1) { - THCTensor_(scanInnermostDim)(state, self, src, init, binary_op); - } else { - THCTensor_(scanOuterDim)(state, self, src, dimension, init, binary_op); + if (!self->is_empty()) { + #ifndef THC_REAL_IS_HALF + if (ndim == 1) { + // thrust does not take an "init" + THCTensor_(scanThrust)(state, self, src, binary_op); + } else + #endif + if (dimension == ndim - 1) { + THCTensor_(scanInnermostDim)(state, self, src, init, binary_op); + } else { + THCTensor_(scanOuterDim)(state, self, src, dimension, init, binary_op); + } } THCTensor_(free)(state, src); diff --git a/aten/src/THCUNN/generic/BatchNormalization.cu b/aten/src/THCUNN/generic/BatchNormalization.cu index 1eb3b820079b7b..ccf5da4da7f25a 100644 --- a/aten/src/THCUNN/generic/BatchNormalization.cu +++ b/aten/src/THCUNN/generic/BatchNormalization.cu @@ -21,11 +21,11 @@ static THCDeviceTensor THNN_(devicetensor)(THCState *state, THCTensor int size[Dim]; for (int i = 0; i < Dim || i < inDim; ++i) { if (i < Dim && i < inDim) { - size[i] = t->size[i]; + size[i] = t->size(i); } else if (i < Dim) { size[i] = 1; } else { - size[Dim - 1] *= t->size[i]; + size[Dim - 1] *= t->size(i); } } return THCDeviceTensor(t->data(), size); diff --git a/aten/src/THCUNN/generic/Col2Im.cu b/aten/src/THCUNN/generic/Col2Im.cu index 03c8dfdd838115..d29dcf75814fd0 100644 --- a/aten/src/THCUNN/generic/Col2Im.cu +++ b/aten/src/THCUNN/generic/Col2Im.cu @@ -6,9 +6,9 @@ static inline void THNN_(Col2Im_shapeCheck)( THCState *state, THCTensor *input, THCTensor *gradOutput, - int outputHeight, int outputWidth, - int kH, int kW, int dH, int dW, - int padH, int padW, int sH, int sW) { + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, int64_t dH, int64_t dW, + int64_t padH, int64_t padW, int64_t sH, int64_t sW) { THArgCheck(kW > 0 && kH > 0, 6, "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); @@ -17,12 +17,12 @@ static inline void THNN_(Col2Im_shapeCheck)( THArgCheck(dW > 0 && dH > 0, 8, "dilation should be greater than zero, but got dH: %d dW: %d", dH, dW); - int ndim = THCTensor_(nDimension)(state, input); + int64_t ndim = THCTensor_(nDimension)(state, input); THCUNN_argCheck(state, !input->is_empty() && (ndim == 2 || ndim == 3), 2, input, "Expected non-empty 2D or 3D input tensor, but got input of shape %s"); int batch_dim = (ndim == 3) ? 
0 : -1; - int64_t nInputPlane = input->size[batch_dim + 1]; + int64_t nInputPlane = input->size(batch_dim + 1); if (nInputPlane % (kW * kH) != 0) { THError("Expected size of input's dimension 1 to be divisible by the " @@ -30,7 +30,7 @@ static inline void THNN_(Col2Im_shapeCheck)( "kernel_size=(%d, %d).", (long long) nInputPlane, kH, kW); } - int64_t inputLength = input->size[batch_dim + 2]; + int64_t inputLength = input->size(batch_dim + 2); int64_t nBlocksH = 1 + (outputHeight + 2 * padH - dH * (kH - 1) - 1) / sH; int64_t nBlocksW = 1 + ( outputWidth + 2 * padW - dW * (kW - 1) - 1) / sW; @@ -54,11 +54,11 @@ void THNN_(Col2Im_updateOutput)( THCState *state, THCTensor *input, THCTensor *output, - int outputHeight, int outputWidth, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW) { + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { THCUNN_assertSameGPU(state, 2, input, output); @@ -69,11 +69,11 @@ void THNN_(Col2Im_updateOutput)( if (input->dim() == 2) { // Force batch batched_input = false; - THCTensor_(resize3d)(state, input, 1, input->size[0], input->size[1]); + THCTensor_(resize3d)(state, input, 1, input->size(0), input->size(1)); } - int64_t batchSize = input->size[0]; - int64_t nInputPlane = input->size[1]; + int64_t batchSize = input->size(0); + int64_t nInputPlane = input->size(1); int64_t nOutputPlane = nInputPlane / (kW * kH); input = THCTensor_(newContiguous)(state, input); @@ -84,10 +84,10 @@ void THNN_(Col2Im_updateOutput)( THCTensor *input_n = THCTensor_(new)(state); THCTensor *output_n = THCTensor_(new)(state); - int height_col = (outputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; - int width_col = (outputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + int64_t height_col = (outputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t width_col = (outputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; - for (int elt = 0; elt < batchSize; elt++) { + for (int64_t elt = 0; elt < batchSize; elt++) { THCTensor_(select)(state, input_n, input, 0, elt); THCTensor_(select)(state, output_n, output, 0, elt); @@ -116,10 +116,10 @@ void THNN_(Col2Im_updateGradInput)( THCState *state, THCTensor *gradOutput, THCTensor *gradInput, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW) { + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { THNN_(Im2Col_updateOutput)(state, gradOutput, gradInput, kH, kW, dH, dW, padH, padW, sH, sW); diff --git a/aten/src/THCUNN/generic/Im2Col.cu b/aten/src/THCUNN/generic/Im2Col.cu index dd6a6dcd01019f..d0f98f0b17cf5f 100644 --- a/aten/src/THCUNN/generic/Im2Col.cu +++ b/aten/src/THCUNN/generic/Im2Col.cu @@ -6,8 +6,8 @@ static inline void THNN_(Im2Col_shapeCheck)( THCState *state, THCTensor *input, THCTensor *gradOutput, - int kH, int kW, int dH, int dW, - int padH, int padW, int sH, int sW) { + int64_t kH, int64_t kW, int64_t dH, int64_t dW, + int64_t padH, int64_t padW, int64_t sH, int64_t sW) { THArgCheck(kW > 0 && kH > 0, 4, "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); @@ -18,7 +18,7 @@ static inline void THNN_(Im2Col_shapeCheck)( THArgCheck(sW > 0 && sH > 0, 10, "stride should be greater than zero, but got sH: %d sW: %d", sH, sW); - int ndim = THCTensor_(nDimension)(state, input); + int64_t ndim = THCTensor_(nDimension)(state, input); THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, 
input, "Expected non-empty 3D or 4D input tensor, but got input of shape %s"); @@ -26,11 +26,11 @@ static inline void THNN_(Im2Col_shapeCheck)( if (ndim == 3) { dim_batch = -1; } - int nInputPlane = THCTensor_(size)(state, input, dim_batch + 1); - int inputHeight = THCTensor_(size)(state, input, dim_batch + 2); - int inputWidth = THCTensor_(size)(state, input, dim_batch + 3); - int outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; - int outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + int64_t nInputPlane = THCTensor_(size)(state, input, dim_batch + 1); + int64_t inputHeight = THCTensor_(size)(state, input, dim_batch + 2); + int64_t inputWidth = THCTensor_(size)(state, input, dim_batch + 3); + int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; if (outputHeight < 1 || outputWidth < 1) { THError("Given input with spatial size (%d, %d), kernel_size=(%d, %d), " @@ -46,10 +46,10 @@ void THNN_(Im2Col_updateOutput)( THCState *state, THCTensor *input, THCTensor *output, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW) { + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { THCUNN_assertSameGPU(state, 2, input, output); @@ -59,18 +59,18 @@ void THNN_(Im2Col_updateOutput)( bool batched_input = true; if (input->dim() == 3) { batched_input = false; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); } - int batchSize = THCTensor_(size)(state, input, 0); - int nInputPlane = THCTensor_(size)(state, input, 1); - int inputHeight = THCTensor_(size)(state, input, 2); - int inputWidth = THCTensor_(size)(state, input, 3); + int64_t batchSize = THCTensor_(size)(state, input, 0); + int64_t nInputPlane = THCTensor_(size)(state, input, 1); + int64_t inputHeight = THCTensor_(size)(state, input, 2); + int64_t inputWidth = THCTensor_(size)(state, input, 3); - int outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; - int outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; - int nOutputPlane = nInputPlane * kW * kH; - int outputLength = outputHeight * outputWidth; + int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + int64_t nOutputPlane = nInputPlane * kW * kH; + int64_t outputLength = outputHeight * outputWidth; THCTensor_(resize3d)(state, output, batchSize, nOutputPlane, outputLength); THCTensor_(zero)(state, output); @@ -78,7 +78,7 @@ void THNN_(Im2Col_updateOutput)( THCTensor *input_n = THCTensor_(new)(state); THCTensor *output_n = THCTensor_(new)(state); - for (int elt = 0; elt < batchSize; elt++) { + for (int64_t elt = 0; elt < batchSize; elt++) { THCTensor_(select)(state, input_n, input, 0, elt); THCTensor_(select)(state, output_n, output, 0, elt); @@ -104,11 +104,11 @@ void THNN_(Im2Col_updateGradInput)( THCState *state, THCTensor *gradOutput, THCTensor *gradInput, - int inputHeight, int inputWidth, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW) { + int64_t inputHeight, int64_t inputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { THNN_(Col2Im_updateOutput)(state, gradOutput, gradInput, inputHeight, inputWidth, 
diff --git a/aten/src/THCUNN/generic/IndexLinear.cu b/aten/src/THCUNN/generic/IndexLinear.cu index 244d2346887328..ea9683d4535390 100644 --- a/aten/src/THCUNN/generic/IndexLinear.cu +++ b/aten/src/THCUNN/generic/IndexLinear.cu @@ -41,12 +41,12 @@ void THNN_(IndexLinear_updateOutput)( THArgCheck(THNN_(checkKeysValues)(state, keys, values), 1, "Keys and values should have the same number of elements"); - int64_t batchSize = sizes->size[0]; - int64_t outDim = bias->size[0]; - int64_t wDim = weight->size[1]; - int64_t weightStride = weight->stride[0]; + int64_t batchSize = sizes->size(0); + int64_t outDim = bias->size(0); + int64_t wDim = weight->size(1); + int64_t weightStride = weight->stride(0); int maxNormalize = wDim - outDim; - int64_t keysSize = keys->size[0]; + int64_t keysSize = keys->size(0); int64_t nnzPerRow = divup(keysSize, batchSize); THCTensor_(resize2d)(state, output, batchSize, outDim); @@ -100,10 +100,10 @@ void THNN_(IndexLinear_accGradParameters)( accreal weightDecay, accreal scale) { - int64_t keysSize = keys->size[0]; - int64_t batchSize = sizes->size[0]; - int64_t outDim = bias->size[0]; - int64_t wDim = weight->size[1]; + int64_t keysSize = keys->size(0); + int64_t batchSize = sizes->size(0); + int64_t outDim = bias->size(0); + int64_t wDim = weight->size(1); int maxNormalize = wDim - outDim; // Make sure these inputs are contiguous to accelerate computations @@ -137,7 +137,7 @@ void THNN_(IndexLinear_accGradParameters)( real *gradOutputData = THCTensor_(data) (state, gradOutput); real *gradBiasData = THCTensor_(data) (state, gradBias); real *gradWeightData = THCTensor_(data) (state, gradWeight); - int64_t gradWeightStride = gradWeight->stride[0]; + int64_t gradWeightStride = gradWeight->stride(0); cudaStream_t stream = THCState_getCurrentStream(state); dim3 threads(THREADS_X, THREADS_Y); @@ -182,10 +182,10 @@ void THNN_(IndexLinear_accUpdateGradParameters)( THArgCheck(THNN_(checkKeysValues)(state, keys, values), 1, "Keys and values should have the same number of elements"); - int64_t batchSize = sizes->size[0]; - int64_t outDim = bias->size[0]; - int64_t keysSize = keys->size[0]; - int64_t wDim = weight->size[1]; + int64_t batchSize = sizes->size(0); + int64_t outDim = bias->size(0); + int64_t keysSize = keys->size(0); + int64_t wDim = weight->size(1); int maxNormalize = wDim - outDim; real *biasData = THCTensor_(data) (state, bias); @@ -194,7 +194,7 @@ void THNN_(IndexLinear_accUpdateGradParameters)( real *valuesData = THCTensor_(data) (state, values); int64_t *keysData = THCudaLongTensor_data (state, keys); int64_t *cumSumSizesData = THCudaLongTensor_data (state, cumSumSizes); - int64_t weightStride = weight->stride[0]; + int64_t weightStride = weight->stride(0); cudaStream_t stream = THCState_getCurrentStream(state); dim3 threads(THREADS_X, THREADS_Y); @@ -241,15 +241,15 @@ void THNN_(IndexLinear_updateParameters)( THArgCheck(THCudaLongTensor_isContiguous(state, cumSumSizes), 6, "cumSumSizes vector must be contiguous"); - int64_t outDim = bias->size[0]; - int64_t wDim = weight->size[1]; + int64_t outDim = bias->size(0); + int64_t wDim = weight->size(1); int maxNormalize = wDim - outDim; - int64_t keysSize = runningKeys->size[0]; - int64_t batchSize = cumSumSizes->size[0]; + int64_t keysSize = runningKeys->size(0); + int64_t batchSize = cumSumSizes->size(0); THCTensor_(cadd)(state, bias, bias, -learningRate, gradBias); - int64_t gradWeightStride = gradWeight->stride[0]; - int64_t weightStride = weight->stride[0]; + int64_t gradWeightStride = gradWeight->stride(0); + 
int64_t weightStride = weight->stride(0); int64_t *keysData = THCudaLongTensor_data (state, runningKeys); int64_t *cumSumSizesData = THCudaLongTensor_data (state, cumSumSizes); diff --git a/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu b/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu index 2b02bf2093ff93..510a8230d74798 100644 --- a/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu +++ b/aten/src/THCUNN/generic/MultiLabelMarginCriterion.cu @@ -18,8 +18,8 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( if(input->dim() == 1) { - int dim = input->size[0]; - THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size[0] == dim), 3, + int dim = input->size(0); + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == dim), 3, "inconsistent target size"); THCTensor_(resize1d)(state, output, 1); @@ -39,17 +39,17 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( } else if(input->dim() == 2) { - int nframe = input->size[0]; - int dim = input->size[1]; - THArgCheck(!target->is_empty() && (target->dim() == 2) && (target->size[0] == nframe) - && (target->size[1] == dim), 3, "inconsistent target size"); + int nframe = input->size(0); + int dim = input->size(1); + THArgCheck(!target->is_empty() && (target->dim() == 2) && (target->size(0) == nframe) + && (target->size(1) == dim), 3, "inconsistent target size"); - dim3 blocks(input->size[0]); + dim3 blocks(input->size(0)); dim3 threads(MULTILABELMARGIN_THREADS); if (reduction != Reduction::None) { - THCTensor *output_tmp = THCTensor_(newWithSize1d)(state, input->size[0]); + THCTensor *output_tmp = THCTensor_(newWithSize1d)(state, input->size(0)); THCTensor_(resize1d)(state, output, 1); cunn_MultiLabelMarginCriterion_updateOutput_kernel @@ -67,7 +67,7 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( } else { - THCTensor_(resize1d)(state, output, input->size[0]); + THCTensor_(resize1d)(state, output, input->size(0)); cunn_MultiLabelMarginCriterion_updateOutput_kernel <<>>( @@ -106,10 +106,10 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( if(gradInput->dim() == 1) { - int dim = gradInput->size[0]; - THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size[0] == dim), 3, + int dim = gradInput->size(0); + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == dim), 3, "inconsistent target size"); - THArgCheck(!istarget->is_empty() && (istarget->dim() == 1) && (istarget->size[0] == dim), 3, + THArgCheck(!istarget->is_empty() && (istarget->dim() == 1) && (istarget->size(0) == dim), 3, "inconsistent isTarget size"); dim3 blocks(1); dim3 threads(MULTILABELMARGIN_THREADS); @@ -121,20 +121,20 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), THCTensor_(data)(state, istarget), - 1, gradInput->size[0], + 1, gradInput->size(0), reduction == Reduction::ElementwiseMean, reduction != Reduction::None); } else if(gradInput->dim() == 2) { - int nframe = gradInput->size[0]; - int dim = gradInput->size[1]; - THArgCheck(!target->is_empty() && (target->dim() == 2) && (target->size[0] == nframe) - && (target->size[1] == dim), 3, "inconsistent target size"); - THArgCheck(!istarget->is_empty() && (istarget->dim() == 2) && (istarget->size[0] == nframe) - && (istarget->size[1] == dim), 3, "inconsistent isTarget size"); - dim3 blocks(gradInput->size[0]); + int nframe = gradInput->size(0); + int dim = gradInput->size(1); + THArgCheck(!target->is_empty() && (target->dim() == 2) && 
(target->size(0) == nframe) + && (target->size(1) == dim), 3, "inconsistent target size"); + THArgCheck(!istarget->is_empty() && (istarget->dim() == 2) && (istarget->size(0) == nframe) + && (istarget->size(1) == dim), 3, "inconsistent isTarget size"); + dim3 blocks(gradInput->size(0)); dim3 threads(MULTILABELMARGIN_THREADS); cunn_MultiLabelMarginCriterion_updateGradInput_kernel @@ -144,7 +144,7 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), THCTensor_(data)(state, istarget), - gradInput->size[0], gradInput->size[1], + gradInput->size(0), gradInput->size(1), reduction == Reduction::ElementwiseMean, reduction != Reduction::None); } diff --git a/aten/src/THCUNN/generic/MultiMarginCriterion.cu b/aten/src/THCUNN/generic/MultiMarginCriterion.cu index a620c0f8dad13f..8272b3d4020ec7 100644 --- a/aten/src/THCUNN/generic/MultiMarginCriterion.cu +++ b/aten/src/THCUNN/generic/MultiMarginCriterion.cu @@ -30,7 +30,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, input->size[0], + 1, input->size(0), reduction == Reduction::ElementwiseMean, margin ); @@ -42,7 +42,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, input->size[0], + 1, input->size(0), reduction == Reduction::ElementwiseMean, margin ); @@ -51,15 +51,15 @@ void THNN_(MultiMarginCriterion_updateOutput)( } else if (input->dim() == 2) { - int nframe = input->size[0]; - THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size[0] == nframe), 3, + int nframe = input->size(0); + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), 3, "inconsistent target size"); - dim3 blocks(input->size[0]); + dim3 blocks(input->size(0)); dim3 threads(MULTIMARGIN_THREADS); if (reduction == Reduction::None) { - THCTensor_(resize1d)(state, output, input->size[0]); + THCTensor_(resize1d)(state, output, input->size(0)); if (p == 1) { cunn_MultiMarginCriterion_updateOutput_kernel<1, real, accreal> <<>>( @@ -67,7 +67,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - nframe, input->size[1], + nframe, input->size(1), false, margin ); @@ -79,7 +79,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - nframe, input->size[1], + nframe, input->size(1), false, margin ); @@ -89,7 +89,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( else { THCTensor_(resize1d)(state, output, 1); - THCTensor *output_ = THCTensor_(newWithSize1d)(state, input->size[0]); // tmp output buffer + THCTensor *output_ = THCTensor_(newWithSize1d)(state, input->size(0)); // tmp output buffer if (p == 1) { cunn_MultiMarginCriterion_updateOutput_kernel<1, real, accreal> <<>>( @@ -97,7 +97,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? 
THCTensor_(data)(state, weights) : NULL, - nframe, input->size[1], + nframe, input->size(1), reduction == Reduction::ElementwiseMean, margin ); @@ -109,7 +109,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - input->size[0], input->size[1], + input->size(0), input->size(1), reduction == Reduction::ElementwiseMean, margin ); @@ -162,7 +162,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, gradInput->size[0], + 1, gradInput->size(0), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None @@ -176,7 +176,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, gradInput->size[0], + 1, gradInput->size(0), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None @@ -186,10 +186,10 @@ void THNN_(MultiMarginCriterion_updateGradInput)( } else if (input->dim() == 2) { - int nframe = gradInput->size[0]; - THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size[0] == nframe), 3, + int nframe = gradInput->size(0); + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), 3, "inconsistent target size"); - dim3 blocks(gradInput->size[0]); + dim3 blocks(gradInput->size(0)); dim3 threads(MULTIMARGIN_THREADS); if (p == 1) @@ -200,7 +200,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - nframe, gradInput->size[1], + nframe, gradInput->size(1), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None @@ -214,7 +214,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - nframe, gradInput->size[1], + nframe, gradInput->size(1), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None diff --git a/aten/src/THCUNN/generic/PReLU.cu b/aten/src/THCUNN/generic/PReLU.cu index e03d5739eb74a6..565ffcccd9ec2d 100644 --- a/aten/src/THCUNN/generic/PReLU.cu +++ b/aten/src/THCUNN/generic/PReLU.cu @@ -24,12 +24,12 @@ void THNN_(PReLU_updateOutput)( input = THCTensor_(newContiguous)(state, input); int n = THCTensor_(nElement)(state, input); - if (input->size[ndim > 1] != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[ndim > 1]); + if (input->size(ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(ndim > 1)); int mapSize = 1; for (int d = 2; d < ndim; d++) { - mapSize *= input->size[d]; + mapSize *= input->size(d); } int nElemsPerSample = nOutputPlane * mapSize; preluForward<<>>( @@ -69,12 +69,12 @@ void THNN_(PReLU_updateGradInput)( gradOutput = THCTensor_(newContiguous)(state, gradOutput); int n = THCTensor_(nElement)(state, input); - if (input->size[ndim > 1] != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[ndim > 1]); + if (input->size(ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. 
Expected %d but got %d.", nOutputPlane, input->size(ndim > 1)); int mapSize = 1; for (int d = 2; d < ndim; d++) { - mapSize *= input->size[d]; + mapSize *= input->size(d); } int nElemsPerSample = nOutputPlane * mapSize; preluBackward<<>>( @@ -142,10 +142,10 @@ void THNN_(PReLU_accGradParameters)( THCTensor *buffer = THCTensor_(newContiguous)(state, gradInput); int64_t size3 = 1; for (int d = 2; d < ndim; d++) { - size3 *= input->size[d]; + size3 *= input->size(d); } - THCTensor_(resize3d)(state, buffer, input->size[0], nOutputPlane, size3); - THCTensor_(resize2d)(state, sumbuf, input->size[0], nOutputPlane); + THCTensor_(resize3d)(state, buffer, input->size(0), nOutputPlane, size3); + THCTensor_(resize2d)(state, sumbuf, input->size(0), nOutputPlane); THCTensor_(sum)(state, sumbuf, buffer, 2, 1); THCTensor_(sum)(state, gradWeightBuf, sumbuf, 0, 1); THCTensor_(cadd)(state, gradWeight, gradWeight, scale, gradWeightBuf); diff --git a/aten/src/THCUNN/generic/SparseLinear.cu b/aten/src/THCUNN/generic/SparseLinear.cu index d5270d6689f1fe..a370fffa527c4e 100644 --- a/aten/src/THCUNN/generic/SparseLinear.cu +++ b/aten/src/THCUNN/generic/SparseLinear.cu @@ -4,17 +4,17 @@ static bool THNN_(checkInput)(THCTensor* t) { - return !t->is_empty() && t->_dim() == 2 && t->size[1] == 3; + return !t->is_empty() && t->_dim() == 2 && t->size(1) == 3; } static bool THNN_(checkSize2D)(THCTensor* t, int64_t size0, int64_t size1) { - return !t->is_empty() && t->_dim() == 2 && t->size[0] == size0 && t->size[1] == size1; + return !t->is_empty() && t->_dim() == 2 && t->size(0) == size0 && t->size(1) == size1; } static bool THNN_(checkSize1D)(THCTensor* t, int64_t size0) { - return !t->is_empty() && t->_dim() == 1 && t->size[0] == size0; + return !t->is_empty() && t->_dim() == 1 && t->size(0) == size0; } static inline void THNN_(copyCudaFloatingType)(THCState *state, THCudaIntTensor *buf, THCTensor *t) { diff --git a/aten/src/THCUNN/generic/SpatialAdaptiveAveragePooling.cu b/aten/src/THCUNN/generic/SpatialAdaptiveAveragePooling.cu index 05a7b04e082b99..b25bbb94e4ea5f 100644 --- a/aten/src/THCUNN/generic/SpatialAdaptiveAveragePooling.cu +++ b/aten/src/THCUNN/generic/SpatialAdaptiveAveragePooling.cu @@ -22,13 +22,13 @@ void THNN_(SpatialAdaptiveAveragePooling_updateOutput)( "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); if (input->dim() == 3) { - int64_t sizeD = input->size[0]; - int64_t isizeH = input->size[1]; - int64_t isizeW = input->size[2]; + int64_t sizeD = input->size(0); + int64_t isizeH = input->size(1); + int64_t isizeW = input->size(2); - int64_t istrideD = input->stride[0]; - int64_t istrideH = input->stride[1]; - int64_t istrideW = input->stride[2]; + int64_t istrideD = input->stride(0); + int64_t istrideH = input->stride(1); + int64_t istrideW = input->stride(2); input_data = THCTensor_(data)(state, input); @@ -49,14 +49,14 @@ void THNN_(SpatialAdaptiveAveragePooling_updateOutput)( } else { input = THCTensor_(newContiguous)(state, input); - int64_t sizeB = input->size[0]; - int64_t sizeD = input->size[1]; - int64_t isizeH = input->size[2]; - int64_t isizeW = input->size[3]; + int64_t sizeB = input->size(0); + int64_t sizeD = input->size(1); + int64_t isizeH = input->size(2); + int64_t isizeW = input->size(3); - int64_t istrideD = input->stride[1]; - int64_t istrideH = input->stride[2]; - int64_t istrideW = input->stride[3]; + int64_t istrideD = input->stride(1); + int64_t istrideH = input->stride(2); + int64_t istrideW = input->stride(3); input_data = THCTensor_(data)(state, 
input); @@ -95,12 +95,12 @@ void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)( gradOutput = THCTensor_(newContiguous)(state, gradOutput); if (input->dim() == 3) { - int64_t sizeD = input->size[0]; - int64_t isizeH = input->size[1]; - int64_t isizeW = input->size[2]; + int64_t sizeD = input->size(0); + int64_t isizeH = input->size(1); + int64_t isizeW = input->size(2); - int64_t osizeH = gradOutput->size[1]; - int64_t osizeW = gradOutput->size[2]; + int64_t osizeH = gradOutput->size(1); + int64_t osizeW = gradOutput->size(2); //bool atomic = (isizeW%osizeW != 0) || (isizeH%osizeH != 0); @@ -129,13 +129,13 @@ void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)( } THCudaCheck(cudaGetLastError()); } else { - int64_t sizeB = input->size[0]; - int64_t sizeD = input->size[1]; - int64_t isizeH = input->size[2]; - int64_t isizeW = input->size[3]; + int64_t sizeB = input->size(0); + int64_t sizeD = input->size(1); + int64_t isizeH = input->size(2); + int64_t isizeW = input->size(3); - int64_t osizeH = gradOutput->size[2]; - int64_t osizeW = gradOutput->size[3]; + int64_t osizeH = gradOutput->size(2); + int64_t osizeW = gradOutput->size(3); //bool atomic = //(isizeW%osizeW != 0) || (isizeH%osizeH != 0); diff --git a/aten/src/THCUNN/generic/SpatialAdaptiveMaxPooling.cu b/aten/src/THCUNN/generic/SpatialAdaptiveMaxPooling.cu index 3e5fab6dd95c14..6ca5c9b42b827d 100644 --- a/aten/src/THCUNN/generic/SpatialAdaptiveMaxPooling.cu +++ b/aten/src/THCUNN/generic/SpatialAdaptiveMaxPooling.cu @@ -24,13 +24,13 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s"); if (input->dim() == 3) { - int64_t sizeD = input->size[0]; - int64_t isizeH = input->size[1]; - int64_t isizeW = input->size[2]; + int64_t sizeD = input->size(0); + int64_t isizeH = input->size(1); + int64_t isizeW = input->size(2); - int64_t istrideD = input->stride[0]; - int64_t istrideH = input->stride[1]; - int64_t istrideW = input->stride[2]; + int64_t istrideD = input->stride(0); + int64_t istrideH = input->stride(1); + int64_t istrideW = input->stride(2); input_data = THCTensor_(data)(state, input); @@ -55,14 +55,14 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( } else { input = THCTensor_(newContiguous)(state, input); - int64_t sizeB = input->size[0]; - int64_t sizeD = input->size[1]; - int64_t isizeH = input->size[2]; - int64_t isizeW = input->size[3]; + int64_t sizeB = input->size(0); + int64_t sizeD = input->size(1); + int64_t isizeH = input->size(2); + int64_t isizeW = input->size(3); - int64_t istrideD = input->stride[1]; - int64_t istrideH = input->stride[2]; - int64_t istrideW = input->stride[3]; + int64_t istrideD = input->stride(1); + int64_t istrideH = input->stride(2); + int64_t istrideW = input->stride(3); input_data = THCTensor_(data)(state, input); @@ -107,12 +107,12 @@ void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( gradOutput = THCTensor_(newContiguous)(state, gradOutput); if (input->dim() == 3) { - int64_t sizeD = input->size[0]; - int64_t isizeH = input->size[1]; - int64_t isizeW = input->size[2]; + int64_t sizeD = input->size(0); + int64_t isizeH = input->size(1); + int64_t isizeW = input->size(2); - int64_t osizeH = gradOutput->size[1]; - int64_t osizeW = gradOutput->size[2]; + int64_t osizeH = gradOutput->size(1); + int64_t osizeW = gradOutput->size(2); //bool atomic = (isizeH%osizeH != 0) || (isizeW%osizeW != 0); @@ -145,13 +145,13 @@ void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( } 
THCudaCheck(cudaGetLastError()); } else { - int64_t sizeB = input->size[0]; - int64_t sizeD = input->size[1]; - int64_t isizeH = input->size[2]; - int64_t isizeW = input->size[3]; + int64_t sizeB = input->size(0); + int64_t sizeD = input->size(1); + int64_t isizeH = input->size(2); + int64_t isizeW = input->size(3); - int64_t osizeH = gradOutput->size[2]; - int64_t osizeW = gradOutput->size[3]; + int64_t osizeH = gradOutput->size(2); + int64_t osizeW = gradOutput->size(3); //bool atomic = (isizeH%osizeH != 0) || (isizeW%osizeW != 0); diff --git a/aten/src/THCUNN/generic/SpatialAveragePooling.cu b/aten/src/THCUNN/generic/SpatialAveragePooling.cu index 7b3d2d4ee4cca6..7811acc4247666 100644 --- a/aten/src/THCUNN/generic/SpatialAveragePooling.cu +++ b/aten/src/THCUNN/generic/SpatialAveragePooling.cu @@ -32,9 +32,9 @@ static inline void THNN_(SpatialAveragePooling_shapeCheck)( "padW = %d, padH = %d, kW = %d, kH = %d", padW, padH, kW, kH); - int64_t nInputPlane = input->size[dimh-1]; - int64_t nInputRows = input->size[dimh]; - int64_t nInputCols = input->size[dimw]; + int64_t nInputPlane = input->size(dimh-1); + int64_t nInputRows = input->size(dimh); + int64_t nInputCols = input->size(dimw); int64_t nOutputRows, nOutputCols; int64_t nOutputPlane = nInputPlane; @@ -88,17 +88,17 @@ void THNN_(SpatialAveragePooling_updateOutput)( int64_t nOutputCols, nOutputRows; if (input->dim() == 3) { - nInputCols = input->size[2]; - nInputRows = input->size[1]; - nInputPlane = input->size[0]; + nInputCols = input->size(2); + nInputRows = input->size(1); + nInputPlane = input->size(0); batchSize = 1; } else { - nInputCols = input->size[3]; - nInputRows = input->size[2]; - nInputPlane = input->size[1]; - batchSize = input->size[0]; + nInputCols = input->size(3); + nInputRows = input->size(2); + nInputPlane = input->size(1); + batchSize = input->size(0); } if(ceil_mode) { @@ -174,18 +174,18 @@ void THNN_(SpatialAveragePooling_updateGradInput)( int dimRow = 1; if (input->dim() == 3) { - nInputPlane = input->size[0]; + nInputPlane = input->size(0); batchSize = 1; } else { dimCol = 3; dimRow = 2; - nInputPlane = input->size[1]; - batchSize = input->size[0]; + nInputPlane = input->size(1); + batchSize = input->size(0); } - nInputCols = input->size[dimCol]; - nInputRows = input->size[dimRow]; + nInputCols = input->size(dimCol); + nInputRows = input->size(dimRow); if(ceil_mode) { nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1; diff --git a/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu b/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu index 64463945e6dadb..f22aba639a2d62 100644 --- a/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu +++ b/aten/src/THCUNN/generic/SpatialConvolutionLocal.cu @@ -30,8 +30,8 @@ static inline void THNN_(SpatialConvolutionLocal_shapeCheck)( THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, "non-empty 3D or 4D input tensor expected but got: %s"); - int64_t nInputPlane = weight->size[2] / (kH * kW); - int64_t nOutputPlane = weight->size[1]; + int64_t nInputPlane = weight->size(2) / (kH * kW); + int64_t nOutputPlane = weight->size(1); if (bias != NULL) { THCUNN_check_dim_size(state, bias, 3, 0, nOutputPlane); @@ -56,9 +56,9 @@ static THCTensor* THNN_(view_weight_local)( AT_CHECK(!weight->is_empty() && (weight->dim() == 3 || weight->dim() == 6), 4, "weight tensor should be (non-empty) 3D or 6D - got size: ", weight->sizes()); if (weight->dim() == 6) { - int64_t s1 = weight->size[0] * weight->size[1]; - int64_t s2 = weight->size[2]; - 
int64_t s3 = weight->size[3] * weight->size[4] * weight->size[5]; + int64_t s1 = weight->size(0) * weight->size(1); + int64_t s2 = weight->size(2); + int64_t s3 = weight->size(3) * weight->size(4) * weight->size(5); THCTensor *old_weight = weight; weight = THCTensor_(newWithStorage3d)(state, weight->storage, @@ -105,7 +105,7 @@ void THNN_(SpatialConvolutionLocal_updateOutput)( } // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); @@ -219,7 +219,7 @@ void THNN_(SpatialConvolutionLocal_updateGradInput)( } // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); @@ -339,7 +339,7 @@ void THNN_(SpatialConvolutionLocal_accGradParameters)( } // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Helpers THCTensor *input_n = THCTensor_(new)(state); diff --git a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu index b5dab9b34394a5..e276d349d648cf 100644 --- a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu +++ b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu @@ -17,7 +17,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( THCUNN_argCheck(state, !weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight, "non-empty 2D or 4D weight tensor expected, but got: %s"); if (bias != NULL) { - THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + THCUNN_check_dim_size(state, bias, 1, 0, weight->size(0)); } } else if (!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -37,8 +37,8 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, "non-empty 3D or 4D input tensor expected but got: %s"); - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t exactInputHeight = inputHeight + 2 * padH; int64_t exactInputWidth = inputWidth + 2 * padW; @@ -59,7 +59,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( } if (weight != NULL) { - int64_t nInputPlane = weight->size[1]; + int64_t nInputPlane = weight->size(1); if (weight->dim() == 2) { nInputPlane /= (kH * kW); } @@ -68,10 +68,10 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( if (gradOutput != NULL) { if (weight != NULL) { - int64_t nOutputPlane = weight->size[0]; + int64_t nOutputPlane = weight->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size[0]; + int64_t nOutputPlane = bias->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); @@ -103,12 +103,12 @@ void THNN_(SpatialConvolutionMM_updateOutput)( int freeWeight = 0; // Params: - int nInputPlane = weight->dim() == 2 ? weight->size[1]/(kH*kW) : weight->size[1]; - int nOutputPlane = weight->size[0]; + int nInputPlane = weight->dim() == 2 ? 
weight->size(1)/(kH*kW) : weight->size(1); + int nOutputPlane = weight->size(0); if (weight->dim() == 4) { - int64_t s1 = weight->size[0]; - int64_t s2 = weight->size[1] * weight->size[2] * weight->size[3]; + int64_t s1 = weight->size(0); + int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3); weight = THCTensor_(newWithStorage2d)(state, weight->storage, weight->storageOffset, s1, -1, s2, -1); freeWeight = 1; } @@ -121,16 +121,16 @@ void THNN_(SpatialConvolutionMM_updateOutput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); @@ -141,7 +141,7 @@ void THNN_(SpatialConvolutionMM_updateOutput)( // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. - if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -199,7 +199,7 @@ void THNN_(SpatialConvolutionMM_updateOutput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = nOutputPlane; - int64_t n = columns->size[1]; + int64_t n = columns->size(1); int64_t k = nInputPlane*kH*kW; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -257,13 +257,13 @@ void THNN_(SpatialConvolutionMM_updateGradInput)( (state, input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, 0); // Params - int nInputPlane = weight->dim() == 2 ? weight->size[1]/(kW*kH) : weight->size[1]; - int nOutputPlane = weight->size[0]; + int nInputPlane = weight->dim() == 2 ? 
weight->size(1)/(kW*kH) : weight->size(1); + int nOutputPlane = weight->size(0); int freeWeight = 0; if (weight->dim() == 4) { - int64_t s1 = weight->size[0]; - int64_t s2 = weight->size[1] * weight->size[2] * weight->size[3]; + int64_t s1 = weight->size(0); + int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3); weight = THCTensor_(newWithStorage2d)(state, weight->storage, weight->storageOffset, s1, -1, s2, -1); freeWeight = 1; } @@ -275,17 +275,17 @@ void THNN_(SpatialConvolutionMM_updateGradInput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); - THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); @@ -306,7 +306,7 @@ void THNN_(SpatialConvolutionMM_updateGradInput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = nInputPlane*kW*kH; - int64_t n = gradColumns->size[1]; + int64_t n = gradColumns->size(1); int64_t k = nOutputPlane; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -387,31 +387,31 @@ void THNN_(SpatialConvolutionMM_accGradParameters)( if (input->dim() == 3) { // Force batch is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); - THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); } - int64_t nInputPlane = input->size[1]; - int64_t nOutputPlane = gradOutput->size[1]; + int64_t nInputPlane = input->size(1); + int64_t nOutputPlane = gradOutput->size(1); int freeWeight = 0; if (gradWeight && gradWeight->dim() == 4) { - int64_t s1 = gradWeight->size[0]; - int64_t s2 = gradWeight->size[1] * gradWeight->size[2] * gradWeight->size[3]; + int64_t s1 = gradWeight->size(0); + int64_t s2 = gradWeight->size(1) * gradWeight->size(2) * gradWeight->size(3); gradWeight = THCTensor_(newWithStorage2d)(state, gradWeight->storage, gradWeight->storageOffset, s1, -1, s2, -1); freeWeight = 1; } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1) < 
outputHeight*outputWidth) { // Resize plane and fill with ones... THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -448,7 +448,7 @@ void THNN_(SpatialConvolutionMM_accGradParameters)( // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = nOutputPlane; int64_t n = nInputPlane*kW*kH; - int64_t k = columns->size[1]; + int64_t k = columns->size(1); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT diff --git a/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu b/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu index fbdd8b4aa072a9..b0e65ed8b8fbab 100644 --- a/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu +++ b/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu @@ -19,16 +19,16 @@ void THNN_(LRNforward)(THCState* state, THCTensor* input, THCTensor* output, if (input->dim() == 3) { batchSize = 1; - nInputPlane = input->size[0]; - imsize_h = input->size[1]; - imsize_w = input->size[2]; + nInputPlane = input->size(0); + imsize_h = input->size(1); + imsize_w = input->size(2); } else { - batchSize = input->size[0]; - nInputPlane = input->size[1]; - imsize_h = input->size[2]; - imsize_w = input->size[3]; + batchSize = input->size(0); + nInputPlane = input->size(1); + imsize_h = input->size(2); + imsize_w = input->size(3); } input = THCTensor_(newContiguous)(state, input); @@ -64,16 +64,16 @@ void THNN_(LRNbackward)(THCState* state, THCTensor* input, THCTensor* output, if (input->dim() == 3) { batchSize = 1; - nInputPlane = input->size[0]; - imsize_h = input->size[1]; - imsize_w = input->size[2]; + nInputPlane = input->size(0); + imsize_h = input->size(1); + imsize_w = input->size(2); } else { - batchSize = input->size[0]; - nInputPlane = input->size[1]; - imsize_h = input->size[2]; - imsize_w = input->size[3]; + batchSize = input->size(0); + nInputPlane = input->size(1); + imsize_h = input->size(2); + imsize_w = input->size(3); } input = THCTensor_(newContiguous)(state, input); diff --git a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu index 61cd0e2f10b4d0..16c0f2475860d3 100644 --- a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu @@ -23,15 +23,15 @@ void THNN_(SpatialDepthwiseConvolution_updateOutput)( // the caller, so we verify that here to some extent // Weight Tensor is shape (output_channels, 1, kH, kW) - THAssert(weight->size[1] == 1); + THAssert(weight->size(1) == 1); // Input Tensor is shape (N, input_channels, H, W) // We verify that the # of output_channels is a multiple of input_channels - THAssert(weight->size[0] % input->size[1] == 0); + THAssert(weight->size(0) % input->size(1) == 0); // Bias has same # of channels as output if (bias) { - THAssert(bias->size[0] == weight->size[0]); + THAssert(bias->size(0) == weight->size(0)); } input = THCTensor_(newContiguous)(state, input); @@ -41,12 +41,12 @@ void THNN_(SpatialDepthwiseConvolution_updateOutput)( // Following the behvaior of other THCUNN functions, we shape the output // Tensor ourselves - int batchSize = input->size[0]; - int height = input->size[2]; - int width = input->size[3]; + int batchSize = input->size(0); + int height = input->size(2); + int width = input->size(3); int outputHeight = (height + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; int outputWidth = (width + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; - int outputChannels = 
weight->size[0]; + int outputChannels = weight->size(0); THCTensor_(resize4d)(state, output, batchSize, outputChannels, outputHeight, outputWidth); @@ -61,7 +61,7 @@ void THNN_(SpatialDepthwiseConvolution_updateOutput)( dBias = toDeviceTensor(state, bias); } - int inputChannels = input->size[1]; + int inputChannels = input->size(1); int depthwiseMultiplier = outputChannels / inputChannels; // One thread per output value @@ -113,20 +113,20 @@ void THNN_(SpatialDepthwiseConvolution_updateGradInput)( // Minimal shape checking, as above // Same # of elements in batch - THAssert(input->size[0] == gradOutput->size[0]); + THAssert(input->size(0) == gradOutput->size(0)); // Same # of filters as outputChannels - THAssert(weight->size[0] == gradOutput->size[1]); + THAssert(weight->size(0) == gradOutput->size(1)); // Resize GradInput THCTensor_(resizeAs)(state, gradInput, input); - int inputChannels = input->size[1]; - int height = input->size[2]; - int width = input->size[3]; + int inputChannels = input->size(1); + int height = input->size(2); + int width = input->size(3); - int outputChannels = gradOutput->size[1]; - int outputHeight = gradOutput->size[2]; - int outputWidth = gradOutput->size[3]; + int outputChannels = gradOutput->size(1); + int outputHeight = gradOutput->size(2); + int outputWidth = gradOutput->size(3); int depthwiseMultiplier = outputChannels / inputChannels; @@ -210,18 +210,18 @@ void THNN_(SpatialDepthwiseConvolution_accGradParameters)( // Minimal shape checking as above // Same # of elements in batch - THAssert(input->size[0] == gradOutput->size[0]); + THAssert(input->size(0) == gradOutput->size(0)); // Same # of filters as outputChannels - THAssert(gradWeight->size[0] == gradOutput->size[1]); + THAssert(gradWeight->size(0) == gradOutput->size(1)); - int batchSize = input->size[0]; - int inputChannels = input->size[1]; - int height = input->size[2]; - int width = input->size[3]; + int batchSize = input->size(0); + int inputChannels = input->size(1); + int height = input->size(2); + int width = input->size(3); - int outputChannels = gradOutput->size[1]; - int outputHeight = gradOutput->size[2]; - int outputWidth = gradOutput->size[3]; + int outputChannels = gradOutput->size(1); + int outputHeight = gradOutput->size(2); + int outputWidth = gradOutput->size(3); int depthwiseMultiplier = outputChannels / inputChannels; diff --git a/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu b/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu index 1cac7f604d354f..6d218ab6ca4829 100644 --- a/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu @@ -21,7 +21,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( "non-empty 4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " "but got: %s"); if (bias != NULL) { - THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + THCUNN_check_dim_size(state, bias, 1, 0, weight->size(0)); } } else if (!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -41,8 +41,8 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, "non-empty 3D or 4D input tensor expected but got: %s"); - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; int64_t outputWidth = 
(inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; @@ -54,16 +54,16 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( } if (weight != NULL) { - int64_t nInputPlane = weight->size[1]; + int64_t nInputPlane = weight->size(1); THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); } if (gradOutput != NULL) { if (weight != NULL) { - int64_t nOutputPlane = weight->size[0]; + int64_t nOutputPlane = weight->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size[0]; + int64_t nOutputPlane = bias->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); @@ -94,8 +94,8 @@ void THNN_(SpatialDilatedConvolution_updateOutput)( dilationH, dilationW, 0); // Params: - int nInputPlane = weight->size[1]; - int nOutputPlane = weight->size[0]; + int nInputPlane = weight->size(1); + int nOutputPlane = weight->size(0); input = THCTensor_(newContiguous)(state, input); weight = THCTensor_(newContiguous)(state, weight); @@ -105,16 +105,16 @@ void THNN_(SpatialDilatedConvolution_updateOutput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); @@ -125,7 +125,7 @@ void THNN_(SpatialDilatedConvolution_updateOutput)( // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. - if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... 
THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -184,7 +184,7 @@ void THNN_(SpatialDilatedConvolution_updateOutput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = nOutputPlane; - int64_t n = columns->size[1]; + int64_t n = columns->size(1); int64_t k = nInputPlane*kH*kW; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -240,8 +240,8 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)( dilationH, dilationW, 0); // Params - int nInputPlane = weight->size[1]; - int nOutputPlane = weight->size[0]; + int nInputPlane = weight->size(1); + int nOutputPlane = weight->size(0); input = THCTensor_(newContiguous)(state, input); gradOutput = THCTensor_(newContiguous)(state, gradOutput); @@ -251,17 +251,17 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); - THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); @@ -282,7 +282,7 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = nInputPlane*kW*kH; - int64_t n = gradColumns->size[1]; + int64_t n = gradColumns->size(1); int64_t k = nOutputPlane; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -367,22 +367,22 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)( if (input->dim() == 3) { // Force batch is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); - THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); } - int64_t nInputPlane = input->size[1]; - int64_t nOutputPlane = gradOutput->size[1]; - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t nInputPlane = input->size(1); + int64_t nOutputPlane = gradOutput->size(1); + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 2 || 
ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -420,7 +420,7 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)( // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = nOutputPlane; int64_t n = nInputPlane*kW*kH; - int64_t k = columns->size[1]; + int64_t k = columns->size(1); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT diff --git a/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu b/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu index 7425345ce2c1fe..48a13720c48f31 100644 --- a/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu +++ b/aten/src/THCUNN/generic/SpatialDilatedMaxPooling.cu @@ -25,7 +25,7 @@ static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)( int batchSize = 1; if (ndim == 4) { - batchSize = input->size[0]; + batchSize = input->size(0); dimf++; dimh++; dimw++; @@ -38,9 +38,9 @@ static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)( "padW = %d, padH = %d, kW = %d, kH = %d", padW, padH, kW, kH); - int64_t nInputPlane = input->size[dimh-1]; - int64_t nInputRows = input->size[dimh]; - int64_t nInputCols = input->size[dimw]; + int64_t nInputPlane = input->size(dimh-1); + int64_t nInputRows = input->size(dimh); + int64_t nInputCols = input->size(dimw); int64_t nOutputRows, nOutputCols; int64_t nOutputPlane = nInputPlane; @@ -102,17 +102,17 @@ void THNN_(SpatialDilatedMaxPooling_updateOutput)( int64_t nOutputCols, nOutputRows; if (input->dim() == 3) { - nInputCols = input->size[2]; - nInputRows = input->size[1]; - nInputPlane = input->size[0]; + nInputCols = input->size(2); + nInputRows = input->size(1); + nInputPlane = input->size(0); batchSize = 1; } else { - nInputCols = input->size[3]; - nInputRows = input->size[2]; - nInputPlane = input->size[1]; - batchSize = input->size[0]; + nInputCols = input->size(3); + nInputRows = input->size(2); + nInputPlane = input->size(1); + batchSize = input->size(0); } if(ceil_mode) { @@ -181,17 +181,17 @@ void THNN_(SpatialDilatedMaxPooling_updateGradInput)( int64_t nOutputCols, nOutputRows; if (input->_dim() == 3) { - nInputCols = input->size[2]; - nInputRows = input->size[1]; - nInputPlane = input->size[0]; + nInputCols = input->size(2); + nInputRows = input->size(1); + nInputPlane = input->size(0); batchSize = 1; } else { - nInputCols = input->size[3]; - nInputRows = input->size[2]; - nInputPlane = input->size[1]; - batchSize = input->size[0]; + nInputCols = input->size(3); + nInputRows = input->size(2); + nInputPlane = input->size(1); + batchSize = input->size(0); } if(ceil_mode) { diff --git a/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu b/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu index 58ab364551c76c..76777796e361e4 100644 --- a/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu @@ -24,7 +24,7 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( THCUNN_argCheck(state, !weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight, "non-empty 2D or 4D weight tensor expected, but got: %s"); if (bias != NULL) { - THCUNN_check_dim_size(state, bias, 1, 0, weight->size[1]); + THCUNN_check_dim_size(state, bias, 1, 0, weight->size(1)); } } else if 
(!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -44,8 +44,8 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( THCUNN_argCheck(state, !input->is_empty() && (ndim == 3 || ndim == 4), 2, input, "non-empty 3D or 4D input tensor expected but got: %s"); - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; @@ -56,16 +56,16 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( } if (weight != NULL) { - int64_t nInputPlane = weight->size[0]; + int64_t nInputPlane = weight->size(0); THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); } if (gradOutput != NULL) { if (weight != NULL) { - int64_t nOutputPlane = weight->size[1]; + int64_t nOutputPlane = weight->size(1); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size[0]; + int64_t nOutputPlane = bias->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); @@ -105,16 +105,16 @@ void THNN_(SpatialFullDilatedConvolution_updateOutput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); @@ -125,7 +125,7 @@ void THNN_(SpatialFullDilatedConvolution_updateOutput)( // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. - if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... 
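SpatialFullDilatedConvolution, being the transposed variant, uses the inverse relation shown in the hunks above: out = (in - 1)*stride - 2*pad + (dilation*(k - 1) + 1) + adj. A quick standalone check with illustrative numbers (the helper name is invented for this sketch):

    #include <cstdint>

    // Same formula as in SpatialFullDilatedConvolution_shapeCheck:
    //   out = (in - 1)*stride - 2*pad + (dilation*(k - 1) + 1) + adj
    static int64_t full_dilated_out_size(int64_t in, int64_t stride, int64_t pad,
                                         int64_t dilation, int64_t k, int64_t adj) {
      return (in - 1) * stride - 2 * pad + (dilation * (k - 1) + 1) + adj;
    }

    // Example: in = 30, stride = 1, pad = 1, dilation = 2, k = 3, adj = 0
    //   -> 29 - 2 + 5 + 0 = 32, recovering the 32-pixel input from the
    //   forward-direction example given earlier.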
THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -143,9 +143,9 @@ void THNN_(SpatialFullDilatedConvolution_updateOutput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = weight->size[1] * weight->size[2] * weight->size[3]; - int64_t n = columns->size[1]; - int64_t k = weight->size[0]; + int64_t m = weight->size(1) * weight->size(2) * weight->size(3); + int64_t n = columns->size(1); + int64_t k = weight->size(0); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT @@ -244,17 +244,17 @@ void THNN_(SpatialFullDilatedConvolution_updateGradInput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); - THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); @@ -285,9 +285,9 @@ void THNN_(SpatialFullDilatedConvolution_updateGradInput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = weight->size[0]; - int64_t n = gradColumns->size[1]; - int64_t k = weight->size[1] * weight->size[2] * weight->size[3]; + int64_t m = weight->size(0); + int64_t n = gradColumns->size(1); + int64_t k = weight->size(1) * weight->size(2) * weight->size(3); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT @@ -371,20 +371,20 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( if (input->dim() == 3) { // Force batch is_batch = 0; - THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); - THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + THCTensor_(resize4d)(state, input, 1, input->size(0), input->size(1), input->size(2)); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... 
THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -419,9 +419,9 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t n = columns->size[0]; // nOutputPlane * kh * kw - int64_t m = input_n->size[0]; // nInputPlane - int64_t k = columns->size[1]; // inputHeight * inputWidth + int64_t n = columns->size(0); // nOutputPlane * kh * kw + int64_t m = input_n->size(0); // nInputPlane + int64_t k = columns->size(1); // inputHeight * inputWidth // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT @@ -488,7 +488,7 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( // Resize if (is_batch == 0) { THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); - THCTensor_(resize3d)(state, input, input->size[1], inputHeight, inputWidth); + THCTensor_(resize3d)(state, input, input->size(1), inputHeight, inputWidth); } THCTensor_(free)(state, input); diff --git a/aten/src/THCUNN/generic/SpatialMaxUnpooling.cu b/aten/src/THCUNN/generic/SpatialMaxUnpooling.cu index 90d6e0a837f067..e86896e2764434 100644 --- a/aten/src/THCUNN/generic/SpatialMaxUnpooling.cu +++ b/aten/src/THCUNN/generic/SpatialMaxUnpooling.cu @@ -17,17 +17,17 @@ void THNN_(SpatialMaxUnpooling_updateOutput)( int64_t nInputCols, nInputRows, nInputPlane, batchSize; if (input->dim() == 3) { - nInputCols = input->size[2]; - nInputRows = input->size[1]; - nInputPlane = input->size[0]; + nInputCols = input->size(2); + nInputRows = input->size(1); + nInputPlane = input->size(0); batchSize = 1; } else { - nInputCols = input->size[3]; - nInputRows = input->size[2]; - nInputPlane = input->size[1]; - batchSize = input->size[0]; + nInputCols = input->size(3); + nInputRows = input->size(2); + nInputPlane = input->size(1); + batchSize = input->size(0); } input = THCTensor_(newContiguous)(state, input); @@ -65,22 +65,22 @@ void THNN_(SpatialMaxUnpooling_updateGradInput)( int dimh = 1; if (input->dim() == 3) { - nInputPlane = input->size[0]; + nInputPlane = input->size(0); batchSize = 1; } else { ++dimw; ++dimh; - nInputPlane = input->size[1]; - batchSize = input->size[0]; + nInputPlane = input->size(1); + batchSize = input->size(0); } - nInputCols = input->size[dimw]; - nInputRows = input->size[dimh]; + nInputCols = input->size(dimw); + nInputRows = input->size(dimh); - if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){ + if(owidth!=gradOutput->size(dimw) || oheight!=gradOutput->size(dimh)){ THError("Inconsistent gradOutput size. 
oheight= %d, owidth= %d, gradOutput: %dx%d", - oheight, owidth,gradOutput->size[dimh],gradOutput->size[dimw]); + oheight, owidth,gradOutput->size(dimh),gradOutput->size(dimw)); } input = THCTensor_(newContiguous)(state, input); diff --git a/aten/src/THCUNN/generic/SpatialReflectionPadding.cu b/aten/src/THCUNN/generic/SpatialReflectionPadding.cu index 0c90944e84a5f3..4e3ab6c7de24c5 100644 --- a/aten/src/THCUNN/generic/SpatialReflectionPadding.cu +++ b/aten/src/THCUNN/generic/SpatialReflectionPadding.cu @@ -97,8 +97,8 @@ void THNN_(SpatialReflectionPadding_updateGradInput)( dimh++; dimw++; } - int iheight = input->size[dimh]; - int iwidth = input->size[dimw]; + int iheight = input->size(dimh); + int iwidth = input->size(dimw); int oheight = iheight + padT + padB; int owidth = iwidth + padL + padR; diff --git a/aten/src/THCUNN/generic/SpatialReplicationPadding.cu b/aten/src/THCUNN/generic/SpatialReplicationPadding.cu index 6ab694dcacd92e..07c51292bf2b37 100644 --- a/aten/src/THCUNN/generic/SpatialReplicationPadding.cu +++ b/aten/src/THCUNN/generic/SpatialReplicationPadding.cu @@ -87,8 +87,8 @@ void THNN_(SpatialReplicationPadding_updateGradInput)( dimh++; dimw++; } - int iheight = input->size[dimh]; - int iwidth = input->size[dimw]; + int iheight = input->size(dimh); + int iwidth = input->size(dimw); int oheight = iheight + padT + padB; int owidth = iwidth + padL + padR; diff --git a/aten/src/THCUNN/generic/SpatialSubSampling.cu b/aten/src/THCUNN/generic/SpatialSubSampling.cu index ea71c82e943fdb..d44168218e44c9 100644 --- a/aten/src/THCUNN/generic/SpatialSubSampling.cu +++ b/aten/src/THCUNN/generic/SpatialSubSampling.cu @@ -25,9 +25,9 @@ static inline void THNN_(SpatialSubSampling_shapeCheck)( dimp++; } - int64_t nInputCols = input->size[dimc]; - int64_t nInputRows = input->size[dimr]; - THArgCheck(input->size[dimp] == nInputPlane, 2, "invalid number of input planes"); + int64_t nInputCols = input->size(dimc); + int64_t nInputRows = input->size(dimr); + THArgCheck(input->size(dimp) == nInputPlane, 2, "invalid number of input planes"); THArgCheck(nInputCols >= kW && nInputRows >= kH, 2, "input image smaller than kernel size"); } @@ -51,8 +51,8 @@ void THNN_(SpatialSubSampling_updateOutput)( THNN_(SpatialSubSampling_shapeCheck)(state, input, NULL, weight, kW, kH); if (input->dim() == 3) { - int64_t nInputCols = input->size[2]; - int64_t nInputRows = input->size[1]; + int64_t nInputCols = input->size(2); + int64_t nInputRows = input->size(1); int64_t nOutputCols = (nInputCols - kW) / dW + 1; int64_t nOutputRows = (nInputRows - kH) / dH + 1; @@ -74,9 +74,9 @@ void THNN_(SpatialSubSampling_updateOutput)( nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); THCudaCheck(cudaGetLastError()); } else { - int64_t nInputCols = input->size[3]; - int64_t nInputRows = input->size[2]; - int64_t nbatch = input->size[0]; + int64_t nInputCols = input->size(3); + int64_t nInputRows = input->size(2); + int64_t nbatch = input->size(0); int64_t nOutputCols = (nInputCols - kW) / dW + 1; int64_t nOutputRows = (nInputRows - kH) / dH + 1; @@ -119,8 +119,8 @@ void THNN_(SpatialSubSampling_updateGradInput)( int nInputPlane = THCTensor_(size)(state, weight, 0); if (input->dim() == 3) { - int64_t nInputCols = input->size[2]; - int64_t nInputRows = input->size[1]; + int64_t nInputCols = input->size(2); + int64_t nInputRows = input->size(1); real *weight_data = THCTensor_(data)(state, weight); gradOutput = THCTensor_(newContiguous)(state, gradOutput); @@ -149,9 +149,9 @@ void THNN_(SpatialSubSampling_updateGradInput)( } 
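Nearly every hunk in this file and its neighbours is the same mechanical change: raw field reads such as input->size[0] and input->stride[0] become accessor calls input->size(0) and input->stride(0), and the THCUNN.h hunk further down widens several kernel parameters from int to int64_t. The diff never shows the accessor definitions themselves; the struct below is only a guess at their shape, meant to illustrate why a checked accessor is preferable to indexing the raw array.

    #include <cassert>
    #include <cstdint>

    // Hypothetical illustration only -- the real THTensor/THCTensor accessors
    // are defined elsewhere in the tree and may differ in detail.
    struct ExampleTensor {
      int64_t *sizes;
      int64_t *strides;
      int ndim;

      int64_t size(int d) const {
        assert(d >= 0 && d < ndim);  // a check that raw size[d] never performed
        return sizes[d];
      }
      int64_t stride(int d) const {
        assert(d >= 0 && d < ndim);
        return strides[d];
      }
    };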
THCudaCheck(cudaGetLastError()); } else { - int64_t nInputCols = input->size[3]; - int64_t nInputRows = input->size[2]; - int64_t nbatch = input->size[0]; + int64_t nInputCols = input->size(3); + int64_t nInputRows = input->size(2); + int64_t nbatch = input->size(0); real *weight_data = THCTensor_(data)(state, weight); gradOutput = THCTensor_(newContiguous)(state, gradOutput); @@ -199,8 +199,8 @@ void THNN_(SpatialSubSampling_accGradParameters)( int nInputPlane = THCTensor_(size)(state, gradWeight, 0); if (input->dim() == 3) { - int64_t nInputCols = input->size[2]; - int64_t nInputRows = input->size[1]; + int64_t nInputCols = input->size(2); + int64_t nInputRows = input->size(1); real *gradWeight_data = THCTensor_(data)(state, gradWeight); real *gradBias_data = THCTensor_(data)(state, gradBias); @@ -221,9 +221,9 @@ void THNN_(SpatialSubSampling_accGradParameters)( nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale); THCudaCheck(cudaGetLastError()); } else { - int64_t nInputCols = input->size[3]; - int64_t nInputRows = input->size[2]; - int64_t nbatch = input->size[0]; + int64_t nInputCols = input->size(3); + int64_t nInputRows = input->size(2); + int64_t nbatch = input->size(0); real *gradWeight_data = THCTensor_(data)(state, gradWeight); real *gradBias_data = THCTensor_(data)(state, gradBias); @@ -242,8 +242,8 @@ void THNN_(SpatialSubSampling_accGradParameters)( int64_t sl; for (sl=0; sl <<>> ( - input_data + sl*input->stride[0], - gradOutput_data + sl*gradOutput->stride[0], + input_data + sl*input->stride(0), + gradOutput_data + sl*gradOutput->stride(0), gradWeight_data, gradBias_data, nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale); } diff --git a/aten/src/THCUNN/generic/THCUNN.h b/aten/src/THCUNN/generic/THCUNN.h index c48536e4ded57b..eaadf66c8306ee 100644 --- a/aten/src/THCUNN/generic/THCUNN.h +++ b/aten/src/THCUNN/generic/THCUNN.h @@ -183,39 +183,39 @@ THC_API void THNN_(Im2Col_updateOutput)( THCState *state, THCTensor *input, THCTensor *output, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW); + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); THC_API void THNN_(Im2Col_updateGradInput)( THCState *state, THCTensor *gradOutput, THCTensor *gradInput, - int inputHeight, int inputWidth, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW); + int64_t inputHeight, int64_t inputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); THC_API void THNN_(Col2Im_updateOutput)( THCState *state, THCTensor *input, THCTensor *output, - int outputHeight, int outputWidth, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW); + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); THC_API void THNN_(Col2Im_updateGradInput)( THCState *state, THCTensor *gradOutput, THCTensor *gradInput, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW); + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); THC_API void THNN_(LeakyReLU_updateOutput)( THCState *state, diff --git a/aten/src/THCUNN/generic/TemporalConvolution.cu b/aten/src/THCUNN/generic/TemporalConvolution.cu index 1bb17612fd9e89..25baf933b57a98 100644 --- a/aten/src/THCUNN/generic/TemporalConvolution.cu +++ b/aten/src/THCUNN/generic/TemporalConvolution.cu @@ -25,13 +25,13 
@@ static inline void THNN_(TemporalConvolution_shapeCheck)( THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); if (inputFrameSize != NULL) { - THArgCheck(input->size[dimF] == *inputFrameSize, 2, + THArgCheck(input->size(dimF) == *inputFrameSize, 2, "invalid input frame size. Got: %d, Expected: %d", - input->size[dimF], *inputFrameSize); + input->size(dimF), *inputFrameSize); } - THArgCheck(input->size[dimS] >= kW, 2, + THArgCheck(input->size(dimS) >= kW, 2, "input sequence smaller than kernel size. Got: %d, Expected: %d", - input->size[dimS], kW); + input->size(dimS), kW); } void THNN_(TemporalConvolution_updateOutput)( @@ -65,7 +65,7 @@ void THNN_(TemporalConvolution_updateOutput)( outputWindow = THCTensor_(new)(state); inputWindow = THCTensor_(new)(state); - nInputFrame = input->size[dimS]; + nInputFrame = input->size(dimS); nOutputFrame = (nInputFrame - kW) / dW + 1; if (input->dim() == 2) @@ -91,14 +91,14 @@ void THNN_(TemporalConvolution_updateOutput)( nOutputFrame -= nFrame; THCTensor_(setStorage2d)(state, inputWindow, input->storage, - input->storageOffset+k*dW*input->size[1], - nFrame, inputFrameStride*input->size[1], - kW*input->size[1], 1); + input->storageOffset+k*dW*input->size(1), + nFrame, inputFrameStride*input->size(1), + kW*input->size(1), 1); THCTensor_(setStorage2d)(state, outputWindow, output->storage, - output->storageOffset + k*output->size[1], - nFrame, outputFrameStride*output->size[1], - output->size[1], 1); + output->storageOffset + k*output->size(1), + nFrame, outputFrameStride*output->size(1), + output->size(1), 1); THCTensor *tweight = THCTensor_(new)(state); THCTensor_(transpose)(state, tweight, weight, 0, 1); @@ -110,7 +110,7 @@ void THNN_(TemporalConvolution_updateOutput)( { THCTensor *outputSample = THCTensor_(new)(state); THCTensor *inputSample = THCTensor_(new)(state); - int nBatchFrame = input->size[0]; + int nBatchFrame = input->size(0); THCTensor_(resize3d)(state, output, nBatchFrame, @@ -139,14 +139,14 @@ void THNN_(TemporalConvolution_updateOutput)( nOutputSampleFrame -= nFrame; THCTensor_(setStorage2d)(state, inputWindow, inputSample->storage, - inputSample->storageOffset+k*dW*inputSample->size[1], - nFrame, inputFrameStride*inputSample->size[1], - kW*inputSample->size[1], 1); + inputSample->storageOffset+k*dW*inputSample->size(1), + nFrame, inputFrameStride*inputSample->size(1), + kW*inputSample->size(1), 1); THCTensor_(setStorage2d)(state, outputWindow, outputSample->storage, - outputSample->storageOffset + k*outputSample->size[1], - nFrame, outputFrameStride*outputSample->size[1], - outputSample->size[1], 1); + outputSample->storageOffset + k*outputSample->size(1), + nFrame, outputFrameStride*outputSample->size(1), + outputSample->size(1), 1); THCTensor *tweight = THCTensor_(new)(state); THCTensor_(transpose)(state, tweight, weight, 0, 1); @@ -194,8 +194,8 @@ void THNN_(TemporalConvolution_updateGradInput)( dimS = 1; } - nInputFrame = input->size[dimS]; - nOutputFrame = gradOutput->size[dimS]; + nInputFrame = input->size(dimS); + nOutputFrame = gradOutput->size(dimS); /* Not necessary with partial backprop: */ @@ -216,14 +216,14 @@ void THNN_(TemporalConvolution_updateGradInput)( nOutputFrame -= nFrame; THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutput->storage, - gradOutput->storageOffset + k*gradOutput->size[1], - nFrame, outputFrameStride*gradOutput->size[1], - gradOutput->size[1], 1); + gradOutput->storageOffset 
+ k*gradOutput->size(1), + nFrame, outputFrameStride*gradOutput->size(1), + gradOutput->size(1), 1); THCTensor_(setStorage2d)(state, gradInputWindow, gradInput->storage, - gradInput->storageOffset+k*dW*gradInput->size[1], - nFrame, inputFrameStride*gradInput->size[1], - kW*gradInput->size[1], 1); + gradInput->storageOffset+k*dW*gradInput->size(1), + nFrame, inputFrameStride*gradInput->size(1), + kW*gradInput->size(1), 1); THCTensor_(addmm)(state, gradInputWindow, ScalarConvert::to(1), gradInputWindow, ScalarConvert::to(1), gradOutputWindow, weight); } @@ -232,7 +232,7 @@ void THNN_(TemporalConvolution_updateGradInput)( { THCTensor *gradOutputSample = THCTensor_(new)(state); THCTensor *gradInputSample = THCTensor_(new)(state); - int64_t nBatchFrame = input->size[0]; + int64_t nBatchFrame = input->size(0); for(i = 0; i < nBatchFrame; i++) { THCTensor_(select)(state, gradOutputSample, gradOutput, 0, i); @@ -248,14 +248,14 @@ void THNN_(TemporalConvolution_updateGradInput)( nOutputSampleFrame -= nFrame; THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutputSample->storage, - gradOutputSample->storageOffset + k*gradOutputSample->size[1], - nFrame, outputFrameStride*gradOutputSample->size[1], - gradOutputSample->size[1], 1); + gradOutputSample->storageOffset + k*gradOutputSample->size(1), + nFrame, outputFrameStride*gradOutputSample->size(1), + gradOutputSample->size(1), 1); THCTensor_(setStorage2d)(state, gradInputWindow, gradInputSample->storage, - gradInputSample->storageOffset+k*dW*gradInputSample->size[1], - nFrame, inputFrameStride*gradInputSample->size[1], - kW*gradInputSample->size[1], 1); + gradInputSample->storageOffset+k*dW*gradInputSample->size(1), + nFrame, inputFrameStride*gradInputSample->size(1), + kW*gradInputSample->size(1), 1); THCTensor_(addmm)(state, gradInputWindow, ScalarConvert::to(1), gradInputWindow, ScalarConvert::to(1), gradOutputWindow, weight); } @@ -298,8 +298,8 @@ void THNN_(TemporalConvolution_accGradParameters)( dimS = 1; } - nInputFrame = input->size[dimS]; - nOutputFrame = gradOutput->size[dimS]; + nInputFrame = input->size(dimS); + nOutputFrame = gradOutput->size(dimS); /* Not necessary with partial backprop: */ input = THCTensor_(newContiguous)(state, input); @@ -325,14 +325,14 @@ void THNN_(TemporalConvolution_accGradParameters)( nOutputFrame -= nFrame; THCTensor_(setStorage2d)(state, inputWindow, input->storage, - input->storageOffset+k*dW*input->size[1], - nFrame, inputFrameStride*input->size[1], - kW*input->size[1], 1); + input->storageOffset+k*dW*input->size(1), + nFrame, inputFrameStride*input->size(1), + kW*input->size(1), 1); THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutput->storage, - gradOutput->storageOffset + k*gradOutput->size[1], - nFrame, outputFrameStride*gradOutput->size[1], - gradOutput->size[1], 1); + gradOutput->storageOffset + k*gradOutput->size(1), + nFrame, outputFrameStride*gradOutput->size(1), + gradOutput->size(1), 1); THCTensor *tgradOutputWindow = THCTensor_(new)(state); THCTensor_(transpose)(state, tgradOutputWindow, gradOutputWindow, 0, 1); @@ -344,7 +344,7 @@ void THNN_(TemporalConvolution_accGradParameters)( { THCTensor *gradOutputSample = THCTensor_(new)(state); THCTensor *inputSample = THCTensor_(new)(state); - int64_t nBatchFrame = input->size[0]; + int64_t nBatchFrame = input->size(0); for(i = 0; i < nBatchFrame; i++) { @@ -368,14 +368,14 @@ void THNN_(TemporalConvolution_accGradParameters)( nOutputSampleFrame -= nFrame; THCTensor_(setStorage2d)(state, inputWindow, inputSample->storage, - 
inputSample->storageOffset+k*dW*inputSample->size[1], - nFrame, inputFrameStride*inputSample->size[1], - kW*inputSample->size[1], 1); + inputSample->storageOffset+k*dW*inputSample->size(1), + nFrame, inputFrameStride*inputSample->size(1), + kW*inputSample->size(1), 1); THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutputSample->storage, - gradOutputSample->storageOffset + k*gradOutputSample->size[1], - nFrame, outputFrameStride*gradOutputSample->size[1], - gradOutputSample->size[1], 1); + gradOutputSample->storageOffset + k*gradOutputSample->size(1), + nFrame, outputFrameStride*gradOutputSample->size(1), + gradOutputSample->size(1), 1); THCTensor *tgradOutputWindow = THCTensor_(new)(state); THCTensor_(transpose)(state, tgradOutputWindow, gradOutputWindow, 0, 1); diff --git a/aten/src/THCUNN/generic/TemporalMaxPooling.cu b/aten/src/THCUNN/generic/TemporalMaxPooling.cu index e355ebd14ee5c6..a950aa730afb59 100644 --- a/aten/src/THCUNN/generic/TemporalMaxPooling.cu +++ b/aten/src/THCUNN/generic/TemporalMaxPooling.cu @@ -27,12 +27,12 @@ static inline void THNN_(TemporalMaxPooling_shapeCheck)( THCUNN_argCheck(state, !input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); - THArgCheck(input->size[dimT] >= kW, 2, + THArgCheck(input->size(dimT) >= kW, 2, "input sequence smaller than kernel size. Got: %d, Expected: %d", - input->size[dimT], kW); + input->size(dimT), kW); - input_w = input->size[dimT]; - input_n = input->size[dimF]; + input_w = input->size(dimT); + input_n = input->size(dimF); output_w = (input_w - kW) / dW + 1; if (gradOutput != NULL) { @@ -71,23 +71,23 @@ void THNN_(TemporalMaxPooling_updateOutput)( { dimT = 1; dimF = 2; - batch = input->size[0]; + batch = input->size(0); } input = THCTensor_(newContiguous)(state, input); - input_w = input->size[dimT]; - input_n = input->size[dimF]; + input_w = input->size(dimT); + input_n = input->size(dimF); output_w = (input_w - kW) / dW + 1; if (input->dim() == 2) { - THCTensor_(resize2d)(state, output, output_w, input->size[dimF]); - THCIndexTensor_(resize2d)(state, indices, output_w, input->size[dimF]); + THCTensor_(resize2d)(state, output, output_w, input->size(dimF)); + THCIndexTensor_(resize2d)(state, indices, output_w, input->size(dimF)); } else { - THCTensor_(resize3d)(state, output, batch, output_w, input->size[dimF]); - THCIndexTensor_(resize3d)(state, indices, batch, output_w, input->size[dimF]); + THCTensor_(resize3d)(state, output, batch, output_w, input->size(dimF)); + THCIndexTensor_(resize3d)(state, indices, batch, output_w, input->size(dimF)); } input_data = THCTensor_(data)(state, input); @@ -146,12 +146,12 @@ void THNN_(TemporalMaxPooling_updateGradInput)( { dimT = 1; dimF = 2; - batch = input->size[0]; + batch = input->size(0); } gradOutput = THCTensor_(newContiguous)(state, gradOutput); - input_w = input->size[dimT]; - input_n = input->size[dimF]; + input_w = input->size(dimT); + input_n = input->size(dimF); output_w = (input_w - kW) / dW + 1; gradInput_data = THCTensor_(data)(state, gradInput); diff --git a/aten/src/THCUNN/generic/TemporalReflectionPadding.cu b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu index 394c796cb9a5ec..0beea05cd2826e 100644 --- a/aten/src/THCUNN/generic/TemporalReflectionPadding.cu +++ b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu @@ -84,7 +84,7 @@ void THNN_(TemporalReflectionPadding_updateGradInput)( planeDim++; dimw++; } - int iwidth = input->size[dimw]; + int iwidth = 
input->size(dimw); int owidth = iwidth + padL + padR; THArgCheck(owidth == THCTensor_(size)(state, gradOutput, dimw), 3, diff --git a/aten/src/THCUNN/generic/TemporalReplicationPadding.cu b/aten/src/THCUNN/generic/TemporalReplicationPadding.cu index 11637dc9dfa31b..96f0ad6fa46304 100644 --- a/aten/src/THCUNN/generic/TemporalReplicationPadding.cu +++ b/aten/src/THCUNN/generic/TemporalReplicationPadding.cu @@ -79,7 +79,7 @@ void THNN_(TemporalReplicationPadding_updateGradInput)( planeDim++; dimw++; } - int iwidth = input->size[dimw]; + int iwidth = input->size(dimw); int owidth = iwidth + padL + padR; THArgCheck(owidth == THCTensor_(size)(state, gradOutput, dimw), 3, diff --git a/aten/src/THCUNN/generic/TemporalRowConvolution.cu b/aten/src/THCUNN/generic/TemporalRowConvolution.cu index 26361d498eeb55..83a32ca2a063dd 100644 --- a/aten/src/THCUNN/generic/TemporalRowConvolution.cu +++ b/aten/src/THCUNN/generic/TemporalRowConvolution.cu @@ -14,7 +14,7 @@ static inline void THNN_(TemporalRowConvolution_shapeCheck)( weight, "non-empty 2D or 3D weight tensor expected, but got: %s"); if (bias != NULL) { - THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + THCUNN_check_dim_size(state, bias, 1, 0, weight->size(0)); } int ndim = input->dim(); @@ -29,8 +29,8 @@ static inline void THNN_(TemporalRowConvolution_shapeCheck)( THCUNN_argCheck(state, !input->is_empty() && (ndim == 2 || ndim == 3), 1, input, "non-empty 2D or 3D (batch mode) input tensor expected, but got :%s"); - int64_t inputFrameSize = weight->size[0]; - int64_t nInputFrame = input->size[dimS]; + int64_t inputFrameSize = weight->size(0); + int64_t nInputFrame = input->size(dimS); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; if (nOutputFrame < 1) { @@ -84,16 +84,16 @@ void THNN_(TemporalRowConvolution_updateOutput)( if (ndim == 2) { // Force batch batch = 0; - THCTensor_(resize3d)(state, input, 1, input->size[0], input->size[1]); + THCTensor_(resize3d)(state, input, 1, input->size(0), input->size(1)); } // Params: - int64_t inputFrameSize = weight->size[0]; - int64_t nInputFrame = input->size[2]; + int64_t inputFrameSize = weight->size(0); + int64_t nInputFrame = input->size(2); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; // Batch size - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize3d)(state, output, batchSize, inputFrameSize, nOutputFrame); @@ -104,7 +104,7 @@ void THNN_(TemporalRowConvolution_updateOutput)( // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever // gets increased and always contains ones. - if (ones->dim() != 2 || ones->size[0] * ones->size[1] < nOutputFrame) { + if (ones->dim() != 2 || ones->size(0) * ones->size(1) < nOutputFrame) { // Resize plane and fill with ones... 
THCTensor_(resize2d)(state, ones, 1, nOutputFrame); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -218,18 +218,18 @@ void THNN_(TemporalRowConvolution_updateGradInput)( if (ndim == 2) { // Force batch batch = 0; - THCTensor_(resize3d)(state, input, 1, input->size[0], input->size[1]); - THCTensor_(resize3d)(state, gradOutput, 1, gradOutput->size[0], - gradOutput->size[1]); + THCTensor_(resize3d)(state, input, 1, input->size(0), input->size(1)); + THCTensor_(resize3d)(state, gradOutput, 1, gradOutput->size(0), + gradOutput->size(1)); } // Params: - int64_t inputFrameSize = weight->size[0]; - int64_t nInputFrame = input->size[2]; - int64_t nOutputFrame = gradOutput->size[2]; + int64_t inputFrameSize = weight->size(0); + int64_t nInputFrame = input->size(2); + int64_t nOutputFrame = gradOutput->size(2); // Batch size - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize3d)(state, gradInput, batchSize, inputFrameSize, @@ -331,21 +331,21 @@ void THNN_(TemporalRowConvolution_accGradParameters)( if (ndim == 2) { // Force batch batch = 0; - THCTensor_(resize3d)(state, input, 1, input->size[0], input->size[1]); - THCTensor_(resize3d)(state, gradOutput, 1, gradOutput->size[0], - gradOutput->size[1]); + THCTensor_(resize3d)(state, input, 1, input->size(0), input->size(1)); + THCTensor_(resize3d)(state, gradOutput, 1, gradOutput->size(0), + gradOutput->size(1)); } // Params: - int64_t inputFrameSize = gradWeight->size[0]; - int64_t nInputFrame = input->size[2]; - int64_t nOutputFrame = gradOutput->size[2]; + int64_t inputFrameSize = gradWeight->size(0); + int64_t nInputFrame = input->size(2); + int64_t nOutputFrame = gradOutput->size(2); // Batch size - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 2 || ones->size[0] * ones->size[1] < nOutputFrame) { + if (ones->dim() != 2 || ones->size(0) * ones->size(1) < nOutputFrame) { // Resize plane and fill with ones... 
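The "buffer of ones" that several of these functions maintain exists so the bias can be added with the same GEMM machinery used for the convolution: a rank-1 product of the bias vector with a row of ones broadcasts the bias over every output location. A CPU-side sketch of that idea, with plain loops standing in for the cuBLAS call and made-up names:

    #include <cstdint>
    #include <vector>

    // output is nOutputPlane x nOutputLocations (flattened spatial/temporal dims).
    // bias * ones^T is a rank-1 update, which is what the GPU path expresses as
    // a GEMM against the shared ones buffer.
    void add_bias_with_ones(std::vector<float>& output,
                            const std::vector<float>& bias,
                            int64_t nOutputPlane, int64_t nOutputLocations) {
      std::vector<float> ones(nOutputLocations, 1.0f);
      for (int64_t p = 0; p < nOutputPlane; ++p)
        for (int64_t i = 0; i < nOutputLocations; ++i)
          output[p * nOutputLocations + i] += bias[p] * ones[i];  // == bias[p]
    }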
THCTensor_(resize2d)(state, ones, 1, nOutputFrame); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); diff --git a/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu b/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu index d29748338f7298..3b34077d1cee84 100644 --- a/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu +++ b/aten/src/THCUNN/generic/VolumetricAdaptiveAveragePooling.cu @@ -28,15 +28,15 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( int64_t totalZ; if (input->dim() == 4) { - sizeD = input->size[0]; - isizeT = input->size[1]; - isizeH = input->size[2]; - isizeW = input->size[3]; + sizeD = input->size(0); + isizeT = input->size(1); + isizeH = input->size(2); + isizeW = input->size(3); - istrideD = input->stride[0]; - istrideT = input->stride[1]; - istrideH = input->stride[2]; - istrideW = input->stride[3]; + istrideD = input->stride(0); + istrideT = input->stride(1); + istrideH = input->stride(2); + istrideW = input->stride(3); THCTensor_(resize4d)(state, output, sizeD, osizeT, osizeH, osizeW); @@ -44,16 +44,16 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( } else { input = THCTensor_(newContiguous)(state, input); - int64_t sizeB = input->size[0]; - sizeD = input->size[1]; - isizeT = input->size[2]; - isizeH = input->size[3]; - isizeW = input->size[4]; + int64_t sizeB = input->size(0); + sizeD = input->size(1); + isizeT = input->size(2); + isizeH = input->size(3); + isizeW = input->size(4); - istrideD = input->stride[1]; - istrideT = input->stride[2]; - istrideH = input->stride[3]; - istrideW = input->stride[4]; + istrideD = input->stride(1); + istrideT = input->stride(2); + istrideH = input->stride(3); + istrideW = input->stride(4); THCTensor_(resize5d)(state, output, sizeB, sizeD, osizeT, osizeH, osizeW); @@ -107,23 +107,23 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( int64_t totalZ; if (input->dim() == 4) { - sizeD = input->size[0]; - isizeT = input->size[1]; - isizeH = input->size[2]; - isizeW = input->size[3]; - - osizeT = gradOutput->size[1]; - osizeH = gradOutput->size[2]; - osizeW = gradOutput->size[3]; + sizeD = input->size(0); + isizeT = input->size(1); + isizeH = input->size(2); + isizeW = input->size(3); + + osizeT = gradOutput->size(1); + osizeH = gradOutput->size(2); + osizeW = gradOutput->size(3); } else { - sizeD = input->size[1]; - isizeT = input->size[2]; - isizeH = input->size[3]; - isizeW = input->size[4]; - - osizeT = gradOutput->size[2]; - osizeH = gradOutput->size[3]; - osizeW = gradOutput->size[4]; + sizeD = input->size(1); + isizeT = input->size(2); + isizeH = input->size(3); + isizeW = input->size(4); + + osizeT = gradOutput->size(2); + osizeH = gradOutput->size(3); + osizeW = gradOutput->size(4); } // somehow nonatomic is passing all test for volumetric case. @@ -132,7 +132,7 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( if (input->dim() == 4) { totalZ = atomic ? sizeD * osizeT : sizeD * isizeT; } else { - int sizeB = input->size[0]; + int sizeB = input->size(0); totalZ = atomic ? 
sizeB * sizeD * osizeT : sizeB * sizeD * isizeT; } diff --git a/aten/src/THCUNN/generic/VolumetricAdaptiveMaxPooling.cu b/aten/src/THCUNN/generic/VolumetricAdaptiveMaxPooling.cu index 7f876ae26b5721..adc23e15dabedc 100644 --- a/aten/src/THCUNN/generic/VolumetricAdaptiveMaxPooling.cu +++ b/aten/src/THCUNN/generic/VolumetricAdaptiveMaxPooling.cu @@ -29,15 +29,15 @@ void THNN_(VolumetricAdaptiveMaxPooling_updateOutput)( int64_t totalZ; if (input->dim() == 4) { - sizeD = input->size[0]; - isizeT = input->size[1]; - isizeH = input->size[2]; - isizeW = input->size[3]; + sizeD = input->size(0); + isizeT = input->size(1); + isizeH = input->size(2); + isizeW = input->size(3); - istrideD = input->stride[0]; - istrideT = input->stride[1]; - istrideH = input->stride[2]; - istrideW = input->stride[3]; + istrideD = input->stride(0); + istrideT = input->stride(1); + istrideH = input->stride(2); + istrideW = input->stride(3); THCTensor_(resize4d)(state, output, sizeD, osizeT, osizeH, osizeW); THCIndexTensor_(resize4d)(state, indices, sizeD, osizeT, osizeH, osizeW); @@ -46,16 +46,16 @@ void THNN_(VolumetricAdaptiveMaxPooling_updateOutput)( } else { input = THCTensor_(newContiguous)(state, input); - int64_t sizeB = input->size[0]; - sizeD = input->size[1]; - isizeT = input->size[2]; - isizeH = input->size[3]; - isizeW = input->size[4]; + int64_t sizeB = input->size(0); + sizeD = input->size(1); + isizeT = input->size(2); + isizeH = input->size(3); + isizeW = input->size(4); - istrideD = input->stride[1]; - istrideT = input->stride[2]; - istrideH = input->stride[3]; - istrideW = input->stride[4]; + istrideD = input->stride(1); + istrideT = input->stride(2); + istrideH = input->stride(3); + istrideW = input->stride(4); THCTensor_(resize5d)(state, output, sizeB, sizeD, osizeT, osizeH, osizeW); THCIndexTensor_(resize5d)(state, indices, sizeB, sizeD, osizeT, osizeH, osizeW); @@ -113,23 +113,23 @@ void THNN_(VolumetricAdaptiveMaxPooling_updateGradInput)( int64_t totalZ; if (input->dim() == 4) { - sizeD = input->size[0]; - isizeT = input->size[1]; - isizeH = input->size[2]; - isizeW = input->size[3]; - - osizeT = gradOutput->size[1]; - osizeH = gradOutput->size[2]; - osizeW = gradOutput->size[3]; + sizeD = input->size(0); + isizeT = input->size(1); + isizeH = input->size(2); + isizeW = input->size(3); + + osizeT = gradOutput->size(1); + osizeH = gradOutput->size(2); + osizeW = gradOutput->size(3); } else { - sizeD = input->size[1]; - isizeT = input->size[2]; - isizeH = input->size[3]; - isizeW = input->size[4]; - - osizeT = gradOutput->size[2]; - osizeH = gradOutput->size[3]; - osizeW = gradOutput->size[4]; + sizeD = input->size(1); + isizeT = input->size(2); + isizeH = input->size(3); + isizeW = input->size(4); + + osizeT = gradOutput->size(2); + osizeH = gradOutput->size(3); + osizeW = gradOutput->size(4); } bool atomic = (isizeW%osizeW != 0) || (isizeH%osizeH != 0) || (isizeT%osizeT != 0); @@ -137,7 +137,7 @@ void THNN_(VolumetricAdaptiveMaxPooling_updateGradInput)( if (input->dim() == 4) { totalZ = sizeD * osizeT; } else { - int sizeB = input->size[0]; + int sizeB = input->size(0); totalZ = sizeB * sizeD * osizeT; } diff --git a/aten/src/THCUNN/generic/VolumetricAveragePooling.cu b/aten/src/THCUNN/generic/VolumetricAveragePooling.cu index b32643ddc4d091..54987bc26bc366 100644 --- a/aten/src/THCUNN/generic/VolumetricAveragePooling.cu +++ b/aten/src/THCUNN/generic/VolumetricAveragePooling.cu @@ -32,11 +32,11 @@ static inline void THNN_(VolumetricAveragePooling_shapeCheck)( if (!input->is_empty() && 
THCTensor_(nDimension)(state, input) == 4) { - THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH - && input->size[dimt] >= kT, 2, + THArgCheck(input->size(dimw) >= kW && input->size(dimh) >= kH + && input->size(dimt) >= kT, 2, "input image (T: %d H: %d W: %d) smaller than " "kernel size (kT: %d kH: %d kW: %d)", - input->size[dimt], input->size[dimh], input->size[dimw], + input->size(dimt), input->size(dimh), input->size(dimw), kT, kH, kW); /* sizes */ @@ -47,11 +47,11 @@ static inline void THNN_(VolumetricAveragePooling_shapeCheck)( } else if (!input->is_empty() && THCTensor_(nDimension)(state, input) == 5) { - THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH - && input->size[dimt] >= kT, 2, + THArgCheck(input->size(dimw) >= kW && input->size(dimh) >= kH + && input->size(dimt) >= kT, 2, "input image (T: %d H: %d W: %d) smaller than " "kernel size (kT: %d kH: %d kW: %d)", - input->size[dimt], input->size[dimh], input->size[dimw], + input->size(dimt), input->size(dimh), input->size(dimw), kT, kH, kW); /* sizes */ diff --git a/aten/src/THCUNN/generic/VolumetricConvolution.cu b/aten/src/THCUNN/generic/VolumetricConvolution.cu index e76f8cb42531f9..f21402e65efc23 100644 --- a/aten/src/THCUNN/generic/VolumetricConvolution.cu +++ b/aten/src/THCUNN/generic/VolumetricConvolution.cu @@ -47,11 +47,11 @@ static inline void THNN_(VolumetricConvolution_shapeCheck) if (weight == NULL) { weight = gradWeight; } - int64_t nOutputPlane = weight->size[0]; - int64_t nInputPlane = weight->size[1]; - int64_t kT = weight->size[2]; - int64_t kH = weight->size[3]; - int64_t kW = weight->size[4]; + int64_t nOutputPlane = weight->size(0); + int64_t nInputPlane = weight->size(1); + int64_t kT = weight->size(2); + int64_t kH = weight->size(3); + int64_t kW = weight->size(4); THArgCheck(kT > 0 && kW > 0 && kH > 0, 4, "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW); @@ -69,9 +69,9 @@ static inline void THNN_(VolumetricConvolution_shapeCheck) dimd++; } - int64_t inputWidth = input->size[dimw]; - int64_t inputHeight = input->size[dimh]; - int64_t inputDepth = input->size[dimd]; + int64_t inputWidth = input->size(dimw); + int64_t inputHeight = input->size(dimh); + int64_t inputDepth = input->size(dimd); int64_t exactInputDepth = inputDepth + 2*padT; int64_t exactInputHeight = inputHeight + 2*padH; @@ -97,7 +97,7 @@ static inline void THNN_(VolumetricConvolution_shapeCheck) } if (bias != NULL) { - THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + THCUNN_check_dim_size(state, bias, 1, 0, weight->size(0)); } THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); @@ -128,30 +128,30 @@ void THNN_(VolumetricConvolution_updateOutput)( bias, dT, dW, dH, padT, padW, padH); input = THCTensor_(newContiguous)(state, input); - int nOutputPlane = (int)weight->size[0]; - int nInputPlane = (int)weight->size[1]; - int kT = (int)weight->size[2]; - int kH = (int)weight->size[3]; - int kW = (int)weight->size[4]; + int nOutputPlane = (int)weight->size(0); + int nInputPlane = (int)weight->size(1); + int kT = (int)weight->size(2); + int kH = (int)weight->size(3); + int kW = (int)weight->size(4); int batch = 1; if (input->dim() == 4) { // Force batch batch = 0; - THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], - input->size[2], input->size[3]); + THCTensor_(resize5d)(state, input, 1, input->size(0), input->size(1), + input->size(2), input->size(3)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; - int64_t 
inputDepth = input->size[4]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); + int64_t inputDepth = input->size(4); int64_t outputWidth = (inputWidth + 2*padH - kH) / dH + 1; int64_t outputHeight = (inputHeight + 2*padT - kT) / dT + 1; int64_t outputDepth = (inputDepth + 2*padW - kW) / dW + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize5d)(state, output, batchSize, nOutputPlane, @@ -163,7 +163,7 @@ void THNN_(VolumetricConvolution_updateOutput)( // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. - if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + if (ones->dim() != 3 || ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... THCTensor_(resize3d)(state, ones, outputHeight, outputWidth, outputDepth); @@ -220,9 +220,9 @@ void THNN_(VolumetricConvolution_updateOutput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = weight->size[0]; - int64_t n = columns->size[1]; - int64_t k = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4]; + int64_t m = weight->size(0); + int64_t n = columns->size(1); + int64_t k = weight->size(1)*weight->size(2)*weight->size(3)*weight->size(4); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT @@ -267,11 +267,11 @@ void THNN_(VolumetricConvolution_updateGradInput)( int padT, int padW, int padH) { - int64_t nOutputPlane = weight->size[0]; - int64_t nInputPlane = weight->size[1]; - int64_t kT = weight->size[2]; - int64_t kH = weight->size[3]; - int64_t kW = weight->size[4]; + int64_t nOutputPlane = weight->size(0); + int64_t nInputPlane = weight->size(1); + int64_t kT = weight->size(2); + int64_t kH = weight->size(3); + int64_t kW = weight->size(4); THCTensor *gradColumns = finput; @@ -287,19 +287,19 @@ void THNN_(VolumetricConvolution_updateGradInput)( input = THCTensor_(newContiguous)(state, input); // Force batch batch = 0; - THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THCTensor_(resize5d)(state, input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; - int64_t inputDepth = input->size[4]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); + int64_t inputDepth = input->size(4); int64_t outputWidth = (inputWidth + 2*padH - kH) / dH + 1; int64_t outputHeight = (inputHeight + 2*padT - kT) / dT + 1; int64_t outputDepth = (inputDepth + 2*padW - kW) / dW + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth, inputDepth); @@ -320,9 +320,9 @@ void THNN_(VolumetricConvolution_updateGradInput)( // M,N,K are dims of matrix A and B // (see 
http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4]; - int64_t n = gradColumns->size[1]; - int64_t k = weight->size[0]; + int64_t m = weight->size(1)*weight->size(2)*weight->size(3)*weight->size(4); + int64_t n = gradColumns->size(1); + int64_t k = weight->size(0); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT @@ -387,11 +387,11 @@ void THNN_(VolumetricConvolution_accGradParameters)( state, input, gradOutput, NULL, gradWeight, gradBias, dT, dW, dH, padT, padW, padH); - int nOutputPlane = (int)gradWeight->size[0]; - int nInputPlane = (int)gradWeight->size[1]; - int kT = (int)gradWeight->size[2]; - int kH = (int)gradWeight->size[3]; - int kW = (int)gradWeight->size[4]; + int nOutputPlane = (int)gradWeight->size(0); + int nInputPlane = (int)gradWeight->size(1); + int kT = (int)gradWeight->size(2); + int kH = (int)gradWeight->size(3); + int kW = (int)gradWeight->size(4); input = THCTensor_(newContiguous)(state, input); gradOutput = THCTensor_(newContiguous)(state, gradOutput); @@ -401,22 +401,22 @@ void THNN_(VolumetricConvolution_accGradParameters)( { // Force batch batch = 0; - THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THCTensor_(resize5d)(state, input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; - int64_t inputDepth = input->size[4]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); + int64_t inputDepth = input->size(4); int64_t outputWidth = (inputWidth + 2*padH - kH) / dH + 1; int64_t outputHeight = (inputHeight + 2*padT - kT) / dT + 1; int64_t outputDepth = (inputDepth + 2*padW - kW) / dW + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + if (ones->dim() != 3 || ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... 
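The m, n, k values read out of weight and gradColumns a few hunks above encode the backward-by-data GEMM, gradColumns = weight^T * gradOutput_n, whose result is then scattered back into gradInput. A shape-only sketch of that bookkeeping (the struct and function names are invented for the illustration):

    #include <cstdint>

    struct GemmDims { int64_t m, n, k; };

    // gradColumns (m x n) = weight^T (m x k) * gradOutput_n (k x n), with
    //   m = nInputPlane * kT * kH * kW   (rows of the column buffer)
    //   n = outT * outH * outW           (output locations per sample)
    //   k = nOutputPlane                 (the reduced dimension)
    GemmDims grad_input_gemm_dims(int64_t nInputPlane, int64_t nOutputPlane,
                                  int64_t kT, int64_t kH, int64_t kW,
                                  int64_t outT, int64_t outH, int64_t outW) {
      return { nInputPlane * kT * kH * kW,
               outT * outH * outW,
               nOutputPlane };
    }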
THCTensor_(resize3d)(state, ones, outputHeight, outputWidth, outputDepth); @@ -447,9 +447,9 @@ void THNN_(VolumetricConvolution_accGradParameters)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = gradWeight->size[0]; - int64_t n = gradWeight->size[1]*gradWeight->size[2]*gradWeight->size[3]*gradWeight->size[4]; - int64_t k = columns->size[1]; + int64_t m = gradWeight->size(0); + int64_t n = gradWeight->size(1)*gradWeight->size(2)*gradWeight->size(3)*gradWeight->size(4); + int64_t k = columns->size(1); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT diff --git a/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu b/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu index 5751ab44662ac2..9e28ab80e43c68 100644 --- a/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu @@ -31,7 +31,7 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for weight, but got: %s"); if (bias != NULL) { - THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]); + THCUNN_check_dim_size(state, bias, 1, 0, weight->size(0)); } } else if (!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -50,9 +50,9 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( dimw++; } - int64_t inputDepth = input->size[dimd]; - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t inputDepth = input->size(dimd); + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; @@ -64,16 +64,16 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( } if (weight != NULL) { - int64_t nInputPlane = weight->size[1]; + int64_t nInputPlane = weight->size(1); THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); } if (gradOutput != NULL) { if (weight != NULL) { - int64_t nOutputPlane = weight->size[0]; + int64_t nOutputPlane = weight->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size[0]; + int64_t nOutputPlane = bias->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimd, outputDepth); @@ -105,8 +105,8 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)( dilationT, dilationH, dilationW, 0); // Params: - int nInputPlane = weight->size[1]; - int nOutputPlane = weight->size[0]; + int nInputPlane = weight->size(1); + int nOutputPlane = weight->size(0); input = THCTensor_(newContiguous)(state, input); weight = THCTensor_(newContiguous)(state, weight); @@ -116,18 +116,18 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)( if (input->dim() == 4) { // Force batch is_batch = 0; - THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); } - int64_t inputDepth = input->size[2]; - int64_t inputHeight = input->size[3]; - int64_t inputWidth = input->size[4]; + int64_t 
inputDepth = input->size(2); + int64_t inputHeight = input->size(3); + int64_t inputWidth = input->size(4); int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize5d)(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); @@ -138,7 +138,7 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)( // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. - if (ones->dim() != 2 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -197,7 +197,7 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = nOutputPlane; - int64_t n = columns->size[1]; + int64_t n = columns->size(1); int64_t k = nInputPlane*kT*kH*kW; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -256,8 +256,8 @@ void THNN_(VolumetricDilatedConvolution_updateGradInput)( weight = THCTensor_(newContiguous)(state, weight); // Params - int nInputPlane = weight->size[1]; - int nOutputPlane = weight->size[0]; + int nInputPlane = weight->size(1); + int nOutputPlane = weight->size(0); input = THCTensor_(newContiguous)(state, input); gradOutput = THCTensor_(newContiguous)(state, gradOutput); @@ -265,19 +265,19 @@ void THNN_(VolumetricDilatedConvolution_updateGradInput)( if (input->dim() == 4) { // Force batch is_batch = 0; - THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THCTensor_(resize5d)(state, input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - int64_t inputDepth = input->size[2]; - int64_t inputWidth = input->size[4]; - int64_t inputHeight = input->size[3]; + int64_t inputDepth = input->size(2); + int64_t inputWidth = input->size(4); + int64_t inputHeight = input->size(3); int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); @@ -298,7 +298,7 @@ void THNN_(VolumetricDilatedConvolution_updateGradInput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = nInputPlane*kT*kW*kH; - int64_t n = gradColumns->size[1]; 
+ int64_t n = gradColumns->size(1); int64_t k = nOutputPlane; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -375,24 +375,24 @@ void THNN_(VolumetricDilatedConvolution_accGradParameters)( if (input->dim() == 4) { // Force batch is_batch = 0; - THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THCTensor_(resize5d)(state, input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - int64_t nInputPlane = input->size[1]; - int64_t nOutputPlane = gradOutput->size[1]; - int64_t inputDepth = input->size[2]; - int64_t inputWidth = input->size[4]; - int64_t inputHeight = input->size[3]; + int64_t nInputPlane = input->size(1); + int64_t nOutputPlane = gradOutput->size(1); + int64_t inputDepth = input->size(2); + int64_t inputWidth = input->size(4); + int64_t inputHeight = input->size(3); int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + if (ones->dim() != 3 || ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... 
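The recurring comment that "gemm assumes column-major matrices" refers to the usual trick of handing row-major buffers to a column-major GEMM with the operands swapped: C = A*B is equivalent to C^T = B^T * A^T, and a row-major matrix reinterpreted as column-major is its transpose. A self-contained sketch of that trick, with a hand-rolled GEMM standing in for the actual BLAS call:

#include <cstdint>
#include <cstdio>

// Minimal column-major GEMM, C = alpha*A*B + beta*C, with A (m x k), B (k x n),
// C (m x n), all column-major with leading dimensions lda/ldb/ldc. This is an
// illustration only, not the real THCBlas/cuBLAS entry point.
static void gemm_colmajor(int64_t m, int64_t n, int64_t k, float alpha,
                          const float* A, int64_t lda, const float* B, int64_t ldb,
                          float beta, float* C, int64_t ldc) {
  for (int64_t j = 0; j < n; ++j)
    for (int64_t i = 0; i < m; ++i) {
      float acc = 0;
      for (int64_t p = 0; p < k; ++p)
        acc += A[i + p * lda] * B[p + j * ldb];
      C[i + j * ldc] = alpha * acc + beta * C[i + j * ldc];
    }
}

int main() {
  // Row-major A (2x3) and B (3x2); we want row-major C = A*B (2x2).
  float A[6] = {1, 2, 3,
                4, 5, 6};
  float B[6] = {7, 8,
                9, 10,
                11, 12};
  float C[4] = {0, 0, 0, 0};
  // The trick the surrounding call sites appear to rely on: row-major memory
  // read column-major is the transpose, and C^T = B^T * A^T, so call the
  // column-major GEMM with the operands swapped.
  gemm_colmajor(/*m=*/2, /*n=*/2, /*k=*/3, 1.0f, B, /*lda=*/2, A, /*ldb=*/3, 0.0f, C, /*ldc=*/2);
  // C in row-major order should come out as {58, 64, 139, 154}.
  printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);
  return 0;
}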
THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -429,7 +429,7 @@ void THNN_(VolumetricDilatedConvolution_accGradParameters)( // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = nOutputPlane; int64_t n = nInputPlane*kT*kW*kH; - int64_t k = columns->size[1]; + int64_t k = columns->size(1); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT diff --git a/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu b/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu index bd653b9f7d195b..96310609e956f4 100644 --- a/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu @@ -34,7 +34,7 @@ static inline void THNN_(VolumetricFullDilatedConvolution_shapeCheck)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for weight, but got: %s"); if (bias != NULL) { - THCUNN_check_dim_size(state, bias, 1, 0, weight->size[1]); + THCUNN_check_dim_size(state, bias, 1, 0, weight->size(1)); } } else if (!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -58,9 +58,9 @@ static inline void THNN_(VolumetricFullDilatedConvolution_shapeCheck)( THCUNN_check_dim_size(state, input, ndim, dimf, nInputPlane); } - int64_t inputWidth = input->size[dimw]; - int64_t inputHeight = input->size[dimh]; - int64_t inputDepth = input->size[dimd]; + int64_t inputWidth = input->size(dimw); + int64_t inputHeight = input->size(dimh); + int64_t inputDepth = input->size(dimd); int64_t outputDepth = (inputDepth - 1) * dT - 2*padT + (dilationT * (kT - 1) + 1) + adjT; int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; @@ -122,18 +122,18 @@ void THNN_(VolumetricFullDilatedConvolution_updateOutput)( if (input->dim() == 4) { // Force batch is_batch = 0; - THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); } - int64_t inputWidth = input->size[4]; - int64_t inputHeight = input->size[3]; - int64_t inputDepth = input->size[2]; + int64_t inputWidth = input->size(4); + int64_t inputHeight = input->size(3); + int64_t inputDepth = input->size(2); int64_t outputDepth = (inputDepth - 1) * dT - 2*padT + (dilationT * (kT - 1) + 1) + adjT; int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize5d)(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); @@ -144,7 +144,7 @@ void THNN_(VolumetricFullDilatedConvolution_updateOutput)( // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. - if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + if (ones->dim() != 3 || ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... 
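The output-extent arithmetic in these shape checks is the standard dilated-convolution formula and its transposed ("full") counterpart. A small numeric check of the two, independent of the TH code (the example sizes below are made up):

#include <cstdint>
#include <cstdio>

// Dilated convolution output extent, as used in the shape checks above.
int64_t conv_out(int64_t in, int64_t pad, int64_t dilation, int64_t k, int64_t stride) {
  return (in + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1;
}

// "Full" (transposed) convolution output extent, with output padding adj.
int64_t full_conv_out(int64_t in, int64_t pad, int64_t dilation, int64_t k,
                      int64_t stride, int64_t adj) {
  return (in - 1) * stride - 2 * pad + (dilation * (k - 1) + 1) + adj;
}

int main() {
  // in=32, pad=1, dilation=2, k=3, stride=1 -> 30
  printf("%lld\n", (long long)conv_out(32, 1, 2, 3, 1));
  // Feeding that back through the full-convolution formula recovers 32,
  // which is why the two appear as a forward/transposed pair in these files.
  printf("%lld\n", (long long)full_conv_out(30, 1, 2, 3, 1, 0));
  return 0;
}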
THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -162,9 +162,9 @@ void THNN_(VolumetricFullDilatedConvolution_updateOutput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; - int64_t n = columns->size[1]; - int64_t k = weight->size[0]; + int64_t m = weight->size(1) * weight->size(2) * weight->size(3) * weight->size(4); + int64_t n = columns->size(1); + int64_t k = weight->size(0); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT @@ -272,19 +272,19 @@ void THNN_(VolumetricFullDilatedConvolution_updateGradInput)( if (input->dim() == 4) { // Force batch is_batch = 0; - THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THCTensor_(resize5d)(state, input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - int64_t inputWidth = input->size[4]; - int64_t inputHeight = input->size[3]; - int64_t inputDepth = input->size[2]; + int64_t inputWidth = input->size(4); + int64_t inputHeight = input->size(3); + int64_t inputDepth = input->size(2); int64_t outputDepth = (inputDepth - 1) * dT - 2*padT + (dilationT * (kT - 1) + 1) + adjT; int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); @@ -316,9 +316,9 @@ void THNN_(VolumetricFullDilatedConvolution_updateGradInput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = weight->size[0]; - int64_t n = gradColumns->size[1]; - int64_t k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + int64_t m = weight->size(0); + int64_t n = gradColumns->size(1); + int64_t k = weight->size(1) * weight->size(2) * weight->size(3) * weight->size(4); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT @@ -407,22 +407,22 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( if (input->dim() == 4) { // Force batch is_batch = 0; - THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THCTensor_(resize5d)(state, input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - int64_t inputWidth = input->size[4]; - int64_t inputHeight = input->size[3]; - int64_t inputDepth = input->size[2]; + int64_t inputWidth = input->size(4); + int64_t inputHeight = input->size(3); + int64_t inputDepth = input->size(2); int64_t outputDepth = (inputDepth - 1) * dT - 2*padT 
+ (dilationT * (kT - 1) + 1) + adjT; int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + if (ones->dim() != 3 || ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); THCTensor_(fill)(state, ones, ScalarConvert::to(1)); @@ -458,9 +458,9 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t n = columns->size[0]; // nOutputPlane * kt * kh * kw - int64_t m = input_n->size[0]; // nInputPlane - int64_t k = columns->size[1]; // inputHeight * inputWidth + int64_t n = columns->size(0); // nOutputPlane * kt * kh * kw + int64_t m = input_n->size(0); // nInputPlane + int64_t k = columns->size(1); // inputHeight * inputWidth // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) #ifdef THC_REAL_IS_FLOAT @@ -527,7 +527,7 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( // Resize if (is_batch == 0) { THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); - THCTensor_(resize4d)(state, input, input->size[1], inputDepth, inputHeight, inputWidth); + THCTensor_(resize4d)(state, input, input->size(1), inputDepth, inputHeight, inputWidth); } THCTensor_(free)(state, input); diff --git a/aten/src/THCUNN/generic/VolumetricMaxUnpooling.cu b/aten/src/THCUNN/generic/VolumetricMaxUnpooling.cu index 0b5a17d0eec02f..8d482f04f873ed 100644 --- a/aten/src/THCUNN/generic/VolumetricMaxUnpooling.cu +++ b/aten/src/THCUNN/generic/VolumetricMaxUnpooling.cu @@ -51,11 +51,11 @@ static inline void THNN_(VolumetricMaxUnpooling_shapeCheck)( } if (gradOutput != NULL) { - if (oT != gradOutput->size[dimt] || oW != gradOutput->size[dimw] || oH != gradOutput->size[dimh]) + if (oT != gradOutput->size(dimt) || oW != gradOutput->size(dimw) || oH != gradOutput->size(dimh)) { THError( "Inconsistent gradOutput size. 
oT= %d, oH= %d, oW= %d, gradOutput: %dx%dx%d", - oT, oH, oW, gradOutput->size[dimt], gradOutput->size[dimh], gradOutput->size[dimw]); + oT, oH, oW, gradOutput->size(dimt), gradOutput->size(dimh), gradOutput->size(dimw)); } THCUNN_check_dim_size(state, gradOutput, input->dim(), dimn, inputSlices); diff --git a/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu b/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu index 071b322232126b..fab12533901c0d 100644 --- a/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu +++ b/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu @@ -28,9 +28,9 @@ static inline void THNN_(VolumetricReplicationPadding_shapeCheck)( } int numPlanes = THCTensor_(size)(state, input, planeDim); - int idepth = input->size[dimd]; - int iheight = input->size[dimh]; - int iwidth = input->size[dimw]; + int idepth = input->size(dimd); + int iheight = input->size(dimh); + int iwidth = input->size(dimw); int odepth = idepth + pfront + pback; int oheight = iheight + ptop + pbottom; int owidth = iwidth + pleft + pright; diff --git a/aten/src/THCUNN/im2col.h b/aten/src/THCUNN/im2col.h index 22f47e0eb9b25e..ba905609d3135f 100644 --- a/aten/src/THCUNN/im2col.h +++ b/aten/src/THCUNN/im2col.h @@ -8,28 +8,28 @@ // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu) template __launch_bounds__(CUDA_NUM_THREADS) -__global__ void im2col_kernel(const int n, const Dtype* data_im, - const int height, const int width, - const int ksize_h, const int ksize_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int height_col, const int width_col, +__global__ void im2col_kernel(const int64_t n, const Dtype* data_im, + const int64_t height, const int64_t width, + const int64_t ksize_h, const int64_t ksize_w, + const int64_t pad_h, const int64_t pad_w, + const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, + const int64_t height_col, const int64_t width_col, Dtype* data_col) { CUDA_KERNEL_LOOP(index, n) { - int w_out = index % width_col; + int64_t w_out = index % width_col; index /= width_col; - int h_out = index % height_col; - int channel_in = index / height_col; - int channel_out = channel_in * ksize_h * ksize_w; - int h_in = h_out * stride_h - pad_h; - int w_in = w_out * stride_w - pad_w; + int64_t h_out = index % height_col; + int64_t channel_in = index / height_col; + int64_t channel_out = channel_in * ksize_h * ksize_w; + int64_t h_in = h_out * stride_h - pad_h; + int64_t w_in = w_out * stride_w - pad_w; data_col += (channel_out * height_col + h_out) * width_col + w_out; data_im += (channel_in * height + h_in) * width + w_in; - for (int i = 0; i < ksize_h; ++i) { - for (int j = 0; j < ksize_w; ++j) { - int h = h_in + i * dilation_h; - int w = w_in + j * dilation_w; + for (int64_t i = 0; i < ksize_h; ++i) { + for (int64_t j = 0; j < ksize_w; ++j) { + int64_t h = h_in + i * dilation_h; + int64_t w = w_in + j * dilation_w; *data_col = (h >= 0 && w >= 0 && h < height && w < width) ? 
data_im[i * dilation_h * width + j * dilation_w] : ScalarConvert::to(0); data_col += height_col * width_col; @@ -39,15 +39,15 @@ __global__ void im2col_kernel(const int n, const Dtype* data_im, } template -void im2col(cudaStream_t stream, const Dtype* data_im, const int channels, - const int height, const int width, - const int height_col, const int width_col, - const int ksize_h, const int ksize_w, const int pad_h, - const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, Dtype* data_col) { +void im2col(cudaStream_t stream, const Dtype* data_im, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t height_col, const int64_t width_col, + const int64_t ksize_h, const int64_t ksize_w, const int64_t pad_h, + const int64_t pad_w, const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, Dtype* data_col) { // We are going to launch channels * height_col * width_col kernels, each // kernel responsible for copying a single-channel grid. - int num_kernels = channels * height_col * width_col; + int64_t num_kernels = channels * height_col * width_col; // Launch im2col_kernel <<>> ( num_kernels, data_im, height, width, ksize_h, ksize_w, @@ -60,37 +60,37 @@ void im2col(cudaStream_t stream, const Dtype* data_im, const int channels, template __launch_bounds__(CUDA_NUM_THREADS) -__global__ void col2im_kernel(const int n, const Dtype* data_col, - const int height, const int width, const int channels, - const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int height_col, const int width_col, +__global__ void col2im_kernel(const int64_t n, const Dtype* data_col, + const int64_t height, const int64_t width, const int64_t channels, + const int64_t kernel_h, const int64_t kernel_w, + const int64_t pad_h, const int64_t pad_w, + const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, + const int64_t height_col, const int64_t width_col, Dtype* data_im) { CUDA_KERNEL_LOOP(index, n) { Acctype val = Acctype(0); - const int w_im = index % width + pad_w; - const int h_im = (index / width) % height + pad_h; - const int c_im = index / (width * height); - int kernel_extent_w = (kernel_w - 1) * dilation_w + 1; - int kernel_extent_h = (kernel_h - 1) * dilation_h + 1; + const int64_t w_im = index % width + pad_w; + const int64_t h_im = (index / width) % height + pad_h; + const int64_t c_im = index / (width * height); + int64_t kernel_extent_w = (kernel_w - 1) * dilation_w + 1; + int64_t kernel_extent_h = (kernel_h - 1) * dilation_h + 1; // compute the start and end of the output - const int w_col_start = + const int64_t w_col_start = (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; - const int w_col_end = min(w_im / stride_w + 1, width_col); - const int h_col_start = + const int64_t w_col_end = min(w_im / stride_w + 1, width_col); + const int64_t h_col_start = (h_im < kernel_extent_h) ? 
0 : (h_im - kernel_extent_h) / stride_h + 1; - const int h_col_end = min(h_im / stride_h + 1, height_col); + const int64_t h_col_end = min(h_im / stride_h + 1, height_col); // TODO: use LCM of stride and dilation to avoid unnecessary loops - for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) { - for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) { - int h_k = (h_im - h_col * stride_h); - int w_k = (w_im - w_col * stride_w); + for (int64_t h_col = h_col_start; h_col < h_col_end; h_col += 1) { + for (int64_t w_col = w_col_start; w_col < w_col_end; w_col += 1) { + int64_t h_k = (h_im - h_col * stride_h); + int64_t w_k = (w_im - w_col * stride_w); if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { h_k /= dilation_h; w_k /= dilation_w; - int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) * + int64_t data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) * height_col + h_col) * width_col + w_col; val += data_col[data_col_index]; } @@ -101,21 +101,21 @@ __global__ void col2im_kernel(const int n, const Dtype* data_col, } template -void col2im(cudaStream_t stream, const Dtype* data_col, const int channels, - const int height, const int width, - const int output_height, const int output_width, - const int patch_h, const int patch_w, const int pad_h, - const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, Dtype* data_im); +void col2im(cudaStream_t stream, const Dtype* data_col, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t output_height, const int64_t output_width, + const int64_t patch_h, const int64_t patch_w, const int64_t pad_h, + const int64_t pad_w, const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, Dtype* data_im); template -void col2im(cudaStream_t stream, const Dtype* data_col, const int channels, - const int height, const int width, - const int output_height, const int output_width, - const int patch_h, const int patch_w, const int pad_h, - const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, Dtype* data_im) { - int num_kernels = channels * height * width; +void col2im(cudaStream_t stream, const Dtype* data_col, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t output_height, const int64_t output_width, + const int64_t patch_h, const int64_t patch_w, const int64_t pad_h, + const int64_t pad_w, const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, Dtype* data_im) { + int64_t num_kernels = channels * height * width; // To avoid involving atomic operations, we will launch one kernel per // bottom dimension, and then in the kernel add up the top dimensions. 
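As the comment above notes, the CUDA col2im avoids atomics by assigning one thread per image element and gathering every column entry that covers it. A plain CPU sketch of that gather formulation (dilation fixed to 1 for brevity; an illustration, not the THCUNN code):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Gather-style col2im: every image pixel sums the column entries that cover
// it, so no two iterations write the same output element and no atomics are
// needed when this is parallelized over pixels.
void col2im_gather(const std::vector<float>& data_col, int64_t channels,
                   int64_t height, int64_t width, int64_t kernel_h, int64_t kernel_w,
                   int64_t pad_h, int64_t pad_w, int64_t stride_h, int64_t stride_w,
                   std::vector<float>& data_im) {
  const int64_t height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
  const int64_t width_col  = (width  + 2 * pad_w - kernel_w) / stride_w + 1;
  for (int64_t c = 0; c < channels; ++c)
    for (int64_t h = 0; h < height; ++h)
      for (int64_t w = 0; w < width; ++w) {
        const int64_t h_im = h + pad_h;  // position in the padded image
        const int64_t w_im = w + pad_w;
        // Range of column positions whose patch covers this pixel.
        const int64_t h_start = (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
        const int64_t w_start = (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
        const int64_t h_end = std::min(h_im / stride_h + 1, height_col);
        const int64_t w_end = std::min(w_im / stride_w + 1, width_col);
        float val = 0;
        for (int64_t h_col = h_start; h_col < h_end; ++h_col)
          for (int64_t w_col = w_start; w_col < w_end; ++w_col) {
            const int64_t h_k = h_im - h_col * stride_h;  // offset inside the patch
            const int64_t w_k = w_im - w_col * stride_w;
            const int64_t col_index =
                (((c * kernel_h + h_k) * kernel_w + w_k) * height_col + h_col) * width_col + w_col;
            val += data_col[col_index];
          }
        data_im[(c * height + h) * width + w] = val;
      }
}

int main() {
  const int64_t C = 1, H = 4, W = 4, K = 2, S = 2, P = 0;
  const int64_t hc = (H + 2 * P - K) / S + 1, wc = (W + 2 * P - K) / S + 1;
  std::vector<float> col(C * K * K * hc * wc, 1.0f);
  std::vector<float> im(C * H * W, 0.0f);
  col2im_gather(col, C, H, W, K, K, P, P, S, S, im);
  // With stride == kernel and no padding the patches tile the image exactly,
  // so every pixel accumulates exactly one 1.0 entry.
  for (float v : im) assert(v == 1.0f);
  return 0;
}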
col2im_kernel <<>> ( diff --git a/aten/src/THNN/generic/Col2Im.c b/aten/src/THNN/generic/Col2Im.c index cb95715d6f9b0d..97ed60b7274330 100644 --- a/aten/src/THNN/generic/Col2Im.c +++ b/aten/src/THNN/generic/Col2Im.c @@ -54,25 +54,25 @@ // // ALSO do vol2col -static void THNN_(im2col)(const real* data_im, const int channels, - const int height, const int width, - const int output_height, const int output_width, - const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, +static void THNN_(im2col)(const real* data_im, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t output_height, const int64_t output_width, + const int64_t kernel_h, const int64_t kernel_w, + const int64_t pad_h, const int64_t pad_w, + const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, real* data_col) { - const int height_col = output_height; - const int width_col = output_width; - const int channels_col = channels * kernel_h * kernel_w; - for (int c_col = 0; c_col < channels_col; ++c_col) { - int w_offset = c_col % kernel_w; - int h_offset = (c_col / kernel_w) % kernel_h; - int c_im = c_col / kernel_h / kernel_w; - for (int h_col = 0; h_col < height_col; ++h_col) { - int h_im = h_col * stride_h - pad_h + h_offset * dilation_h; - for (int w_col = 0; w_col < width_col; ++w_col) { - int w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + const int64_t height_col = output_height; + const int64_t width_col = output_width; + const int64_t channels_col = channels * kernel_h * kernel_w; + for (int64_t c_col = 0; c_col < channels_col; ++c_col) { + int64_t w_offset = c_col % kernel_w; + int64_t h_offset = (c_col / kernel_w) % kernel_h; + int64_t c_im = c_col / kernel_h / kernel_w; + for (int64_t h_col = 0; h_col < height_col; ++h_col) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (int64_t w_col = 0; w_col < width_col; ++w_col) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; data_col[(c_col * height_col + h_col) * width_col + w_col] = (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? 
data_im[(c_im * height + h_im) * width + w_im] : 0; @@ -81,26 +81,26 @@ static void THNN_(im2col)(const real* data_im, const int channels, } } -static void THNN_(col2im)(const real* data_col, const int channels, - const int height, const int width, - const int output_height, const int output_width, - const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, +static void THNN_(col2im)(const real* data_col, const int64_t channels, + const int64_t height, const int64_t width, + const int64_t output_height, const int64_t output_width, + const int64_t kernel_h, const int64_t kernel_w, + const int64_t pad_h, const int64_t pad_w, + const int64_t stride_h, const int64_t stride_w, + const int64_t dilation_h, const int64_t dilation_w, real* data_im) { memset(data_im, 0, sizeof(real) * height * width * channels); - const int height_col = output_height; - const int width_col = output_width; - const int channels_col = channels * kernel_h * kernel_w; - for (int c_col = 0; c_col < channels_col; ++c_col) { - int w_offset = c_col % kernel_w; - int h_offset = (c_col / kernel_w) % kernel_h; - int c_im = c_col / kernel_h / kernel_w; - for (int h_col = 0; h_col < height_col; ++h_col) { - int h_im = h_col * stride_h - pad_h + h_offset * dilation_h; - for (int w_col = 0; w_col < width_col; ++w_col) { - int w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + const int64_t height_col = output_height; + const int64_t width_col = output_width; + const int64_t channels_col = channels * kernel_h * kernel_w; + for (int64_t c_col = 0; c_col < channels_col; ++c_col) { + int64_t w_offset = c_col % kernel_w; + int64_t h_offset = (c_col / kernel_w) % kernel_h; + int64_t c_im = c_col / kernel_h / kernel_w; + for (int64_t h_col = 0; h_col < height_col; ++h_col) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (int64_t w_col = 0; w_col < width_col; ++w_col) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width) data_im[(c_im * height + h_im) * width + w_im] += data_col[(c_col * height_col + h_col) * width_col + w_col]; @@ -113,9 +113,9 @@ static inline void THNN_(Col2Im_shapeCheck)( THNNState *state, THTensor *input, THTensor *gradOutput, - int outputHeight, int outputWidth, - int kH, int kW, int dH, int dW, - int padH, int padW, int sH, int sW) { + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, int64_t dH, int64_t dW, + int64_t padH, int64_t padW, int64_t sH, int64_t sW) { THArgCheck(kW > 0 && kH > 0, 6, "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); @@ -124,12 +124,12 @@ static inline void THNN_(Col2Im_shapeCheck)( THArgCheck(dW > 0 && dH > 0, 8, "dilation should be greater than zero, but got dH: %d dW: %d", dH, dW); - int ndim = THTensor_(nDimension)(input); + int64_t ndim = THTensor_(nDimension)(input); THNN_ARGCHECK(!input->is_empty() && (ndim == 2 || ndim == 3), 2, input, "Expected non-empty 2D or 3D input tensor, but got input of shape %s"); - int batch_dim = (ndim == 3) ? 0 : -1; - int64_t nInputPlane = input->size[batch_dim + 1]; + int64_t batch_dim = (ndim == 3) ? 
0 : -1; + int64_t nInputPlane = input->size(batch_dim + 1); if (nInputPlane % (kW * kH) != 0) { THError("Expected size of input's dimension 1 to be divisible by the " @@ -137,7 +137,7 @@ static inline void THNN_(Col2Im_shapeCheck)( "kernel_size=(%d, %d).", (long long) nInputPlane, kH, kW); } - int64_t inputLength = input->size[batch_dim + 2]; + int64_t inputLength = input->size(batch_dim + 2); int64_t nBlocksH = 1 + (outputHeight + 2 * padH - dH * (kH - 1) - 1) / sH; int64_t nBlocksW = 1 + ( outputWidth + 2 * padW - dW * (kW - 1) - 1) / sW; @@ -161,11 +161,11 @@ void THNN_(Col2Im_updateOutput)( THNNState *state, THTensor *input, THTensor *output, - int outputHeight, int outputWidth, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW) { + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { THNN_(Col2Im_shapeCheck)(state, input, NULL, outputHeight, outputWidth, kH, kW, dH, dW, padH, padW, sH, sW); @@ -174,11 +174,11 @@ void THNN_(Col2Im_updateOutput)( if (input->dim() == 2) { // Force batch batched_input = false; - THTensor_(resize3d)(input, 1, input->size[0], input->size[1]); + THTensor_(resize3d)(input, 1, input->size(0), input->size(1)); } - long batchSize = input->size[0]; - long nInputPlane = input->size[1]; + long batchSize = input->size(0); + long nInputPlane = input->size(1); long nOutputPlane = nInputPlane / (kW * kH); input = THTensor_(newContiguous)(input); @@ -189,10 +189,10 @@ void THNN_(Col2Im_updateOutput)( THTensor *input_n = THTensor_(new)(); THTensor *output_n = THTensor_(new)(); - int height_col = (outputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; - int width_col = (outputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + int64_t height_col = (outputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t width_col = (outputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; - for (int elt = 0; elt < batchSize; elt++) { + for (int64_t elt = 0; elt < batchSize; elt++) { THTensor_(select)(input_n, input, 0, elt); THTensor_(select)(output_n, output, 0, elt); @@ -220,10 +220,10 @@ void THNN_(Col2Im_updateGradInput)( THNNState *state, THTensor *gradOutput, THTensor *gradInput, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW) { + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { THNN_(Im2Col_updateOutput)(state, gradOutput, gradInput, kH, kW, dH, dW, padH, padW, sH, sW); diff --git a/aten/src/THNN/generic/FeatureLPPooling.c b/aten/src/THNN/generic/FeatureLPPooling.c index fdb4bbefa78878..fec3d5d7e38a46 100644 --- a/aten/src/THNN/generic/FeatureLPPooling.c +++ b/aten/src/THNN/generic/FeatureLPPooling.c @@ -24,9 +24,9 @@ static inline size_t flpGetOffset(FeatureLPPoolingSizes* s, FEATURE_LP_SIZE_TYPE opt1, FEATURE_LP_SIZE_TYPE opt2) { return s->stride[0] * batch + - s->stride[1] * feature + - s->stride[2] * opt1 + - s->stride[3] * opt2; + s->stride[1] * feature + + s->stride[2] * opt1 + + s->stride[3] * opt2; } static inline size_t flpOutputSize(FEATURE_LP_SIZE_TYPE inputSize, diff --git a/aten/src/THNN/generic/Im2Col.c b/aten/src/THNN/generic/Im2Col.c index 8678a6ea8946f9..5ae83c5416c99c 100644 --- a/aten/src/THNN/generic/Im2Col.c +++ b/aten/src/THNN/generic/Im2Col.c @@ -6,8 +6,8 @@ static inline void THNN_(Im2Col_shapeCheck)( THNNState *state, THTensor *input, THTensor *gradOutput, - int kH, int kW, int dH, int dW, - int padH, int padW, int sH, int sW) { + 
int64_t kH, int64_t kW, int64_t dH, int64_t dW, + int64_t padH, int64_t padW, int64_t sH, int64_t sW) { THArgCheck(kW > 0 && kH > 0, 4, "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); @@ -16,21 +16,21 @@ static inline void THNN_(Im2Col_shapeCheck)( THArgCheck(sW > 0 && sH > 0, 10, "stride should be greater than zero, but got sH: %d sW: %d", sH, sW); - int ndim = THTensor_(nDimension)(input); + int64_t ndim = THTensor_(nDimension)(input); THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, "Expected non-empty 3D or 4D input tensor, but got input of shape %s"); - int dim_batch = 0; + int64_t dim_batch = 0; if (ndim == 3) { dim_batch = -1; } - int nInputPlane = THTensor_(size)(input, dim_batch + 1); - int inputHeight = THTensor_(size)(input, dim_batch + 2); - int inputWidth = THTensor_(size)(input, dim_batch + 3); - int outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; - int outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; - int nOutputPlane = nInputPlane * kW * kH; - int outputLength = outputHeight * outputWidth; + int64_t nInputPlane = THTensor_(size)(input, dim_batch + 1); + int64_t inputHeight = THTensor_(size)(input, dim_batch + 2); + int64_t inputWidth = THTensor_(size)(input, dim_batch + 3); + int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + int64_t nOutputPlane = nInputPlane * kW * kH; + int64_t outputLength = outputHeight * outputWidth; if (outputHeight < 1 || outputWidth < 1) { THError("Given input with spatial size (%d, %d), kernel_size=(%d, %d), " @@ -46,10 +46,10 @@ void THNN_(Im2Col_updateOutput)( THNNState *state, THTensor *input, THTensor *output, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW) { + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { THNN_(Im2Col_shapeCheck)(state, input, NULL, kH, kW, dH, dW, padH, padW, sH, sW); @@ -57,18 +57,18 @@ void THNN_(Im2Col_updateOutput)( bool batched_input = true; if (input->dim() == 3) { batched_input = false; - THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2)); } - int batchSize = THTensor_(size)(input, 0); - int nInputPlane = THTensor_(size)(input, 1); - int inputHeight = THTensor_(size)(input, 2); - int inputWidth = THTensor_(size)(input, 3); + int64_t batchSize = THTensor_(size)(input, 0); + int64_t nInputPlane = THTensor_(size)(input, 1); + int64_t inputHeight = THTensor_(size)(input, 2); + int64_t inputWidth = THTensor_(size)(input, 3); - int outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; - int outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; - int nOutputPlane = nInputPlane * kW * kH; - int outputLength = outputHeight * outputWidth; + int64_t outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1; + int64_t outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1; + int64_t nOutputPlane = nInputPlane * kW * kH; + int64_t outputLength = outputHeight * outputWidth; THTensor_(resize3d)(output, batchSize, nOutputPlane, outputLength); THTensor_(zero)(output); @@ -76,7 +76,7 @@ void THNN_(Im2Col_updateOutput)( THTensor *input_n = THTensor_(new)(); THTensor *output_n = THTensor_(new)(); - for (int elt = 0; elt < batchSize; elt++) { + for (int64_t elt = 0; elt < 
batchSize; elt++) { THTensor_(select)(input_n, input, 0, elt); THTensor_(select)(output_n, output, 0, elt); @@ -102,11 +102,11 @@ void THNN_(Im2Col_updateGradInput)( THNNState *state, THTensor *gradOutput, THTensor *gradInput, - int inputHeight, int inputWidth, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW) { + int64_t inputHeight, int64_t inputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW) { THNN_(Col2Im_updateOutput)(state, gradOutput, gradInput, diff --git a/aten/src/THNN/generic/IndexLinear.c b/aten/src/THNN/generic/IndexLinear.c index 50aa93d788c6ec..5bc4c548fc24d0 100644 --- a/aten/src/THNN/generic/IndexLinear.c +++ b/aten/src/THNN/generic/IndexLinear.c @@ -65,7 +65,7 @@ void THNN_(IndexLinear_updateOutput)( real* outputData = THTensor_(data)(output); real* valuesData = THTensor_(data)(values); real* weightData = THTensor_(data)(weight); - int64_t weightStride0 = weight->stride[0]; + int64_t weightStride0 = weight->stride(0); real* biasData = THTensor_(data)(bias); int64_t* keysData = THLongTensor_data(keys); @@ -258,7 +258,7 @@ void THNN_(IndexLinear_updateParameters)( /* Access the storage data/strides */ real* gradWeightData = THTensor_(data)(gradWeight); real* weightData = THTensor_(data)(weight); - int64_t weightStride0 = weight->stride[0]; + int64_t weightStride0 = weight->stride(0); real* gradBiasData = THTensor_(data)(gradBias); real* biasData = THTensor_(data)(bias); int64_t* keysData = THLongTensor_data(runningKeys); @@ -406,7 +406,7 @@ void THNN_(IndexLinear_accUpdateGradParameters)( real* valuesData =THTensor_(data)(values); real* weightData = THTensor_(data)(weight); real* biasData = THTensor_(data)(bias); - int64_t weightStride0 = weight->stride[0]; + int64_t weightStride0 = weight->stride(0); int64_t* keysData = THLongTensor_data(keys); int64_t* sizesData = THLongTensor_data(sizes); diff --git a/aten/src/THNN/generic/LookupTable.c b/aten/src/THNN/generic/LookupTable.c index 05694fc70d2336..d040d00ade3515 100644 --- a/aten/src/THNN/generic/LookupTable.c +++ b/aten/src/THNN/generic/LookupTable.c @@ -40,7 +40,7 @@ void THNN_(LookupTable_accGradParameters)( if (scaleGradByFreq) { - THIntegerTensor_(resize1d)(count, gradWeight->size[0]); + THIntegerTensor_(resize1d)(count, gradWeight->size(0)); count_data = THIntegerTensor_(data)(count); } diff --git a/aten/src/THNN/generic/MultiLabelMarginCriterion.c b/aten/src/THNN/generic/MultiLabelMarginCriterion.c index 3072c03a03ea81..0699c3ac471c55 100644 --- a/aten/src/THNN/generic/MultiLabelMarginCriterion.c +++ b/aten/src/THNN/generic/MultiLabelMarginCriterion.c @@ -23,16 +23,16 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( if (input->dim() == 1) { nframe = 1; - dim = input->size[0]; - AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size[0] == dim), + dim = input->size(0); + AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == dim), "inconsistent target size"); } else { - nframe = input->size[0]; - dim = input->size[1]; - AT_CHECK(!target->is_empty() && target->dim() == 2 && (target->size[0] == nframe) - && (target->size[1] == dim), "inconsistent target size"); + nframe = input->size(0); + dim = input->size(1); + AT_CHECK(!target->is_empty() && target->dim() == 2 && (target->size(0) == nframe) + && (target->size(1) == dim), "inconsistent target size"); } THArgCheck(THIndexTensor_(minall)(target) >= -1+TH_INDEX_BASE, 3, "target out of range"); @@ -161,20 +161,20 @@ void 
THNN_(MultiLabelMarginCriterion_updateGradInput)( if (input->dim() == 1) { nframe = 1; - dim = input->size[0]; - AT_CHECK((!target->is_empty() && target->dim() == 1) && (target->size[0] == dim), + dim = input->size(0); + AT_CHECK((!target->is_empty() && target->dim() == 1) && (target->size(0) == dim), "inconsistent target size"); - AT_CHECK((!isTarget->is_empty() && isTarget->dim() == 1) && (isTarget->size[0] == dim), + AT_CHECK((!isTarget->is_empty() && isTarget->dim() == 1) && (isTarget->size(0) == dim), "inconsistent isTarget size"); } else { - nframe = input->size[0]; - dim = input->size[1]; - AT_CHECK(!target->is_empty() && (target->dim() == 2) && (target->size[0] == nframe) - && (target->size[1] == dim), 3, "inconsistent target size"); - AT_CHECK(!isTarget->is_empty() && (isTarget->dim() == 2) && (isTarget->size[0] == nframe) - && (isTarget->size[1] == dim), 3, "inconsistent isTarget size"); + nframe = input->size(0); + dim = input->size(1); + AT_CHECK(!target->is_empty() && (target->dim() == 2) && (target->size(0) == nframe) + && (target->size(1) == dim), 3, "inconsistent target size"); + AT_CHECK(!isTarget->is_empty() && (isTarget->dim() == 2) && (isTarget->size(0) == nframe) + && (isTarget->size(1) == dim), 3, "inconsistent isTarget size"); } THArgCheck(THIndexTensor_(minall)(target) >= -1+TH_INDEX_BASE, 3, "target out of range"); diff --git a/aten/src/THNN/generic/MultiMarginCriterion.c b/aten/src/THNN/generic/MultiMarginCriterion.c index 620e13c5c02b16..424669e5de8515 100644 --- a/aten/src/THNN/generic/MultiMarginCriterion.c +++ b/aten/src/THNN/generic/MultiMarginCriterion.c @@ -26,13 +26,13 @@ void THNN_(MultiMarginCriterion_updateOutput)( if (input->dim() == 1) { nframe = 1; - dim = input->size[0]; + dim = input->size(0); } else { - nframe = input->size[0]; - dim = input->size[1]; - AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size[0] == nframe), + nframe = input->size(0); + dim = input->size(1); + AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), "inconsistent target size, got: ", target->sizes()); } @@ -142,13 +142,13 @@ void THNN_(MultiMarginCriterion_updateGradInput)( if (input->dim() == 1) { nframe = 1; - dim = input->size[0]; + dim = input->size(0); } else { - nframe = input->size[0]; - dim = input->size[1]; - AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size[0] == nframe), + nframe = input->size(0); + dim = input->size(1); + AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), "inconsistent target size, got: ", target->sizes()); } diff --git a/aten/src/THNN/generic/PReLU.c b/aten/src/THNN/generic/PReLU.c index 462280c92a1b60..81486227115068 100644 --- a/aten/src/THNN/generic/PReLU.c +++ b/aten/src/THNN/generic/PReLU.c @@ -26,13 +26,13 @@ void THNN_(PReLU_updateOutput)( int64_t bs = 1, ks = 1; { int64_t input_ndim = THTensor_(_nDimension)(input); - if (input->size[input_ndim > 1] != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]); + if (input->size(input_ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. 
Expected %d but got %d.", nOutputPlane, input->size(input_ndim > 1)); if (input_ndim > 1) { - bs = input->size[0]; + bs = input->size(0); for (int d = 2; d < input_ndim; d++) { - ks *= input->size[d]; + ks *= input->size(d); } } } @@ -91,13 +91,13 @@ void THNN_(PReLU_updateGradInput)( int64_t bs = 1, ks = 1; { int64_t input_ndim = THTensor_(_nDimension)(input); - if (input->size[input_ndim > 1] != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]); + if (input->size(input_ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(input_ndim > 1)); if (input_ndim > 1) { - bs = input->size[0]; + bs = input->size(0); for (int d = 2; d < input_ndim; d++) { - ks *= input->size[d]; + ks *= input->size(d); } } } @@ -162,13 +162,13 @@ void THNN_(PReLU_accGradParameters)( int64_t bs = 1, ks = 1; { int64_t input_ndim = THTensor_(_nDimension)(input); - if (input->size[input_ndim > 1] != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]); + if (input->size(input_ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(input_ndim > 1)); if (input_ndim > 1) { - bs = input->size[0]; + bs = input->size(0); for (int d = 2; d < input_ndim; d++) { - ks *= input->size[d]; + ks *= input->size(d); } } } diff --git a/aten/src/THNN/generic/SparseLinear.c b/aten/src/THNN/generic/SparseLinear.c index a0c078b4f895c9..c7a00e335dbc6b 100644 --- a/aten/src/THNN/generic/SparseLinear.c +++ b/aten/src/THNN/generic/SparseLinear.c @@ -6,39 +6,39 @@ #include #endif -#define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride[0]) -#define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride[1]) +#define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride(0)) +#define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride(1)) static bool THNN_(checkLegacyInput)(THTensor* t) { - return !t->is_empty() && t->dim() == 3 && t->size[2] == 2; + return !t->is_empty() && t->dim() == 3 && t->size(2) == 2; } static bool THNN_(checkInput)(THTensor* t) { - return!t->is_empty() && t->dim() == 2 && t->size[1] == 3; + return!t->is_empty() && t->dim() == 2 && t->size(1) == 3; } static bool THNN_(checkSize2D)(THTensor* t, int64_t size0, int64_t size1) { - return !t->is_empty() && t->dim() == 2 && t->size[0] == size0 && t->size[1] == size1; + return !t->is_empty() && t->dim() == 2 && t->size(0) == size0 && t->size(1) == size1; } static bool THNN_(checkSize1D)(THTensor* t, int64_t size0) { - return !t->is_empty() && t->dim() == 1 && t->size[0] == size0; + return !t->is_empty() && t->dim() == 1 && t->size(0) == size0; } static void THNN_(set1d)(THTensor *t, int64_t x0, real value) { - THStorage_(set)(t->storage, t->storageOffset + x0*t->stride[0], value); + THStorage_(set)(t->storage, t->storageOffset + x0*t->stride(0), value); } static real THNN_(get3d)(const THTensor *t, int64_t x0, int64_t x1, int64_t x2) { return THStorage_(get)(t->storage, t->storageOffset + - x0*t->stride[0] + x1*t->stride[1] + x2*t->stride[2]); + x0*t->stride(0) + x1*t->stride(1) + x2*t->stride(2)); } static real THNN_(get2d)(const THTensor *t, int64_t x0, int64_t x1) { return THStorage_(get)(t->storage, t->storageOffset + - x0*t->stride[0] + x1*t->stride[1]); + x0*t->stride(0) + x1*t->stride(1)); } void THNN_(SparseLinear_updateOutput)( @@ -92,8 +92,8 @@ void THNN_(SparseLinear_updateOutput)( if 
(offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, - COL_PTR2(weight, offset), weight->stride[0], - ROW_PTR2(output, h), output->stride[1]); + COL_PTR2(weight, offset), weight->stride(0), + ROW_PTR2(output, h), output->stride(1)); } else { THError("index out of bound. updateOutput: %d not between 1 and %d", offset + 1, inDim); @@ -147,8 +147,8 @@ void THNN_(SparseLinear_legacyUpdateOutput)( if (offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, - COL_PTR2(weight, offset), weight->stride[0], - ROW_PTR2(output, h), output->stride[1]); + COL_PTR2(weight, offset), weight->stride(0), + ROW_PTR2(output, h), output->stride(1)); } else { THError("index out of bound. updateOutput: %d not between 1 and %d", offset + 1, inDim); @@ -221,8 +221,8 @@ void THNN_(SparseLinear_accGradParameters)( if (offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, - ROW_PTR2(gradOutput, h), gradOutput->stride[1], - COL_PTR2(gradWeight, offset), gradWeight->stride[0]); + ROW_PTR2(gradOutput, h), gradOutput->stride(1), + COL_PTR2(gradWeight, offset), gradWeight->stride(0)); } else { THError( "index out of bound. accGradParameters: %d not between 1 and %d", @@ -289,8 +289,8 @@ void THNN_(SparseLinear_legacyAccGradParameters)( if (offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, - ROW_PTR2(gradOutput, h), gradOutput->stride[1], - COL_PTR2(gradWeight, offset), gradWeight->stride[0]); + ROW_PTR2(gradOutput, h), gradOutput->stride(1), + COL_PTR2(gradWeight, offset), gradWeight->stride(0)); } else { THError( "index out of bound. accGradParameters: %d not between 1 and %d", @@ -324,8 +324,8 @@ void THNN_(SparseLinear_updateParameters)( { real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_); int64_t i; - int64_t outDim = weight->size[0]; - int64_t inDim = weight->size[1]; + int64_t outDim = weight->size(0); + int64_t inDim = weight->size(1); THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); @@ -380,8 +380,8 @@ void THNN_(SparseLinear_updateParameters)( int64_t offset = (int64_t)uniqueOffsets_p[i]; THBlas_(axpy)(outDim, -learningRate, - COL_PTR2(gradWeight, offset), gradWeight->stride[0], - COL_PTR2(weight, offset), weight->stride[0]); + COL_PTR2(gradWeight, offset), gradWeight->stride(0), + COL_PTR2(weight, offset), weight->stride(0)); } THTensor_(free)(uniqueOffsets); @@ -398,8 +398,8 @@ void THNN_(SparseLinear_legacyUpdateParameters)( { real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_); int64_t h, i; - int64_t outDim = weight->size[0]; - int64_t inDim = weight->size[1]; + int64_t outDim = weight->size(0); + int64_t inDim = weight->size(1); THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); @@ -456,8 +456,8 @@ void THNN_(SparseLinear_legacyUpdateParameters)( int64_t offset = (int64_t)uniqueOffsets_p[i]; THBlas_(axpy)(outDim, -learningRate, - COL_PTR2(gradWeight, offset), gradWeight->stride[0], - COL_PTR2(weight, offset), weight->stride[0]); + COL_PTR2(gradWeight, offset), gradWeight->stride(0), + COL_PTR2(weight, offset), weight->stride(0)); } THTensor_(free)(uniqueOffsets); @@ -471,8 +471,8 @@ void THNN_(SparseLinear_zeroGradParameters)( { int64_t i, j; - int64_t outDim = gradWeight->size[0]; - int64_t inDim = gradWeight->size[1]; + int64_t outDim = gradWeight->size(0); + int64_t inDim = gradWeight->size(1); THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong"); THArgCheck(THNN_(checkInput)(lastInput), 4, @@ -492,10 +492,10 @@ void THNN_(SparseLinear_zeroGradParameters)( 
int64_t offset = (int64_t)(THNN_(get2d)(lastInput, i, 1)) - 1; if (offset >= 0 && offset < inDim) { real* pGradWeight = COL_PTR2(gradWeight, offset); - if (gradWeight->stride[0] == 1) { + if (gradWeight->stride(0) == 1) { THVector_(fill)(pGradWeight, 0, outDim); } else { - int64_t stride = gradWeight->stride[0]; + int64_t stride = gradWeight->stride(0); for (j = 0; j < outDim; ++j) { pGradWeight[j * stride] = 0; } @@ -517,8 +517,8 @@ void THNN_(SparseLinear_legacyZeroGradParameters)( { int64_t h, i, j; - int64_t outDim = gradWeight->size[0]; - int64_t inDim = gradWeight->size[1]; + int64_t outDim = gradWeight->size(0); + int64_t inDim = gradWeight->size(1); THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong"); THArgCheck(THNN_(checkLegacyInput)(lastInput), 4, @@ -540,10 +540,10 @@ void THNN_(SparseLinear_legacyZeroGradParameters)( int64_t offset = (int64_t)(THNN_(get3d)(lastInput, h, i, 0)) - 1; if (offset >= 0 && offset < inDim) { real* pGradWeight = COL_PTR2(gradWeight, offset); - if (gradWeight->stride[0] == 1) { + if (gradWeight->stride(0) == 1) { THVector_(fill)(pGradWeight, 0, outDim); } else { - int64_t stride = gradWeight->stride[0]; + int64_t stride = gradWeight->stride(0); for (j = 0; j < outDim; ++j) { pGradWeight[j * stride] = 0; } diff --git a/aten/src/THNN/generic/SpatialAdaptiveAveragePooling.c b/aten/src/THNN/generic/SpatialAdaptiveAveragePooling.c index c81657f7718c24..e7c47485f969d7 100644 --- a/aten/src/THNN/generic/SpatialAdaptiveAveragePooling.c +++ b/aten/src/THNN/generic/SpatialAdaptiveAveragePooling.c @@ -92,21 +92,21 @@ void THNN_(SpatialAdaptiveAveragePooling_updateOutput)( if (input->dim() == 4) { - istrideB = input->stride[0]; - sizeB = input->size[0]; + istrideB = input->stride(0); + sizeB = input->size(0); dimD++; dimH++; dimW++; } /* sizes */ - sizeD = input->size[dimD]; - isizeH = input->size[dimH]; - isizeW = input->size[dimW]; + sizeD = input->size(dimD); + isizeH = input->size(dimH); + isizeW = input->size(dimW); /* strides */ - istrideD = input->stride[dimD]; - istrideH = input->stride[dimH]; - istrideW = input->stride[dimW]; + istrideD = input->stride(dimD); + istrideH = input->stride(dimH); + istrideW = input->stride(dimW); /* resize output */ if (input->dim() == 3) @@ -218,18 +218,18 @@ void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)( THTensor_(zero)(gradInput); if (input->dim() == 4) { - sizeB = input->size[0]; + sizeB = input->size(0); dimD++; dimH++; dimW++; } /* sizes */ - sizeD = input->size[dimD]; - isizeH = input->size[dimH]; - isizeW = input->size[dimW]; - osizeH = gradOutput->size[dimH]; - osizeW = gradOutput->size[dimW]; + sizeD = input->size(dimD); + isizeH = input->size(dimH); + isizeW = input->size(dimW); + osizeH = gradOutput->size(dimH); + osizeW = gradOutput->size(dimW); /* get raw pointers */ gradInput_data = THTensor_(data)(gradInput); diff --git a/aten/src/THNN/generic/SpatialAdaptiveMaxPooling.c b/aten/src/THNN/generic/SpatialAdaptiveMaxPooling.c index 711fa73b46555a..fc49a2388cfb6f 100644 --- a/aten/src/THNN/generic/SpatialAdaptiveMaxPooling.c +++ b/aten/src/THNN/generic/SpatialAdaptiveMaxPooling.c @@ -102,20 +102,20 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( if (input->dim() == 4) { - istrideB = input->stride[0]; - sizeB = input->size[0]; + istrideB = input->stride(0); + sizeB = input->size(0); dimW++; dimH++; } /* sizes */ - sizeD = input->size[dimH-1]; - isizeH = input->size[dimH]; - isizeW = input->size[dimW]; + sizeD = input->size(dimH-1); + isizeH = input->size(dimH); + isizeW = 
input->size(dimW); /* strides */ - istrideD = input->stride[dimH-1]; - istrideH = input->stride[dimH]; - istrideW = input->stride[dimW]; + istrideD = input->stride(dimH-1); + istrideH = input->stride(dimH); + istrideW = input->stride(dimW); /* resize output */ if (input->dim() == 3) @@ -223,17 +223,17 @@ void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( THTensor_(zero)(gradInput); if (input->dim() == 4) { - sizeB = input->size[0]; + sizeB = input->size(0); dimW++; dimH++; } /* sizes */ - sizeD = input->size[dimH-1]; - isizeH = input->size[dimH]; - isizeW = input->size[dimW]; - osizeH = gradOutput->size[dimH]; - osizeW = gradOutput->size[dimW]; + sizeD = input->size(dimH-1); + isizeH = input->size(dimH); + isizeW = input->size(dimW); + osizeH = gradOutput->size(dimH); + osizeW = gradOutput->size(dimW); /* get raw pointers */ gradInput_data = THTensor_(data)(gradInput); diff --git a/aten/src/THNN/generic/SpatialAveragePooling.c b/aten/src/THNN/generic/SpatialAveragePooling.c index 2a057e43d294b9..bdf851cf5f2a19 100644 --- a/aten/src/THNN/generic/SpatialAveragePooling.c +++ b/aten/src/THNN/generic/SpatialAveragePooling.c @@ -31,9 +31,9 @@ static inline void THNN_(SpatialAveragePooling_shapeCheck)( "padW = %d, padH = %d, kW = %d, kH = %d", padW, padH, kW, kH); - int64_t nInputPlane = input->size[dimh-1]; - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t nInputPlane = input->size(dimh-1); + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t outputHeight, outputWidth; int64_t nOutputPlane = nInputPlane; @@ -103,15 +103,15 @@ void THNN_(SpatialAveragePooling_updateOutput)( (input, NULL, kH, kW, dH, dW, padH, padW, ceil_mode); if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; dimc++; } - inputWidth = input->size[dimw]; - inputHeight = input->size[dimh]; - nInputPlane = input->size[dimc]; + inputWidth = input->size(dimw); + inputHeight = input->size(dimh); + nInputPlane = input->size(dimc); if(ceil_mode) { @@ -136,7 +136,7 @@ void THNN_(SpatialAveragePooling_updateOutput)( if (input->dim() == 3) THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); else - THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth); + THTensor_(resize4d)(output, input->size(0), nInputPlane, outputHeight, outputWidth); input = THTensor_(newContiguous)(input); THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); @@ -232,16 +232,16 @@ void THNN_(SpatialAveragePooling_updateGradInput)( if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; dimc++; ndim = 4; } - inputWidth = input->size[dimw]; - inputHeight = input->size[dimh]; - nInputPlane = input->size[dimc]; + inputWidth = input->size(dimw); + inputHeight = input->size(dimh); + nInputPlane = input->size(dimc); if(ceil_mode) { diff --git a/aten/src/THNN/generic/SpatialConvolutionLocal.c b/aten/src/THNN/generic/SpatialConvolutionLocal.c index 443901a37e70c1..6461285ca931fb 100644 --- a/aten/src/THNN/generic/SpatialConvolutionLocal.c +++ b/aten/src/THNN/generic/SpatialConvolutionLocal.c @@ -29,8 +29,8 @@ static inline void THNN_(SpatialConvolutionLocal_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, "non-empty 3D or 4D input tensor expected but got: %s"); - int64_t nInputPlane = weight->size[2] / (kH * kW); - int64_t nOutputPlane = weight->size[1]; + int64_t nInputPlane = weight->size(2) / (kH * kW); 
+ int64_t nOutputPlane = weight->size(1); if (bias != NULL) { THNN_CHECK_DIM_SIZE(bias, 3, 0, nOutputPlane); @@ -53,9 +53,9 @@ static THTensor* THNN_(view_weight_local)(THTensor *_weight) AT_CHECK(!weight->is_empty() && (weight->dim() == 3 || weight->dim() == 6), "weight tensor should be (non-empty) 3D or 6D - got size: ", weight->sizes()); if (weight->dim() == 6) { - int64_t s1 = weight->size[0] * weight->size[1]; - int64_t s2 = weight->size[2]; - int64_t s3 = weight->size[3] * weight->size[4] * weight->size[5]; + int64_t s1 = weight->size(0) * weight->size(1); + int64_t s2 = weight->size(2); + int64_t s3 = weight->size(3) * weight->size(4) * weight->size(5); THTensor *old_weight = weight; weight = THTensor_(newWithStorage3d)(weight->storage, weight->storageOffset, @@ -140,7 +140,7 @@ void THNN_(SpatialConvolutionLocal_updateOutput)( } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth); @@ -243,7 +243,7 @@ void THNN_(SpatialConvolutionLocal_updateGradInput)( } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; #pragma omp parallel for private(t) @@ -339,7 +339,7 @@ void THNN_(SpatialConvolutionLocal_accGradParameters)( } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; for(t = 0; t < T; t++) diff --git a/aten/src/THNN/generic/SpatialConvolutionMM.c b/aten/src/THNN/generic/SpatialConvolutionMM.c index cdbff690b248af..434320a3cfdfd7 100644 --- a/aten/src/THNN/generic/SpatialConvolutionMM.c +++ b/aten/src/THNN/generic/SpatialConvolutionMM.c @@ -16,7 +16,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( THNN_ARGCHECK(!weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight, "non-empty 2D or 4D weight tensor expected, but got: %s"); if (bias != NULL) { - THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size(0)); } } else if (!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -36,8 +36,8 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, "non-empty 3D or 4D input tensor expected but got: %s"); - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t exactInputHeight = inputHeight + 2 * padH; int64_t exactInputWidth = inputWidth + 2 * padW; @@ -58,7 +58,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( } if (weight != NULL) { - int64_t nInputPlane = weight->size[1]; + int64_t nInputPlane = weight->size(1); if (weight->dim() == 2) { nInputPlane /= (kH * kW); } @@ -67,10 +67,10 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( if (gradOutput != NULL) { if (weight != NULL) { - int64_t nOutputPlane = weight->size[0]; + int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size[0]; + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); @@ -81,8 +81,8 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( static THTensor* THNN_(newViewWeightMM2d)(THTensor *weight) { weight = THTensor_(newContiguous)(weight); if (weight->dim() == 4) { - int64_t s1 = weight->size[0]; - int64_t s2 = weight->size[1] * 
weight->size[2] * weight->size[3]; + int64_t s1 = weight->size(0); + int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3); THTensor *old_weight = weight; weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, s1, -1, s2, -1); @@ -123,7 +123,7 @@ static void THNN_(SpatialConvolutionMM_updateOutput_frame)( if (bias) { for(i = 0; i < nOutputPlane; i++) THVector_(fill) - (THStorage_(data)(output->storage) + output->storageOffset + output->stride[0] * i, + (THStorage_(data)(output->storage) + output->storageOffset + output->stride(0) * i, THTensor_(get1d)(bias, i), outputHeight*outputWidth); } else { THTensor_(zero)(output); @@ -166,10 +166,10 @@ void THNN_(SpatialConvolutionMM_updateOutput)( dimw++; } - int64_t nInputPlane = input->size[dimf]; - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; - int64_t nOutputPlane = weight->size[0]; + int64_t nInputPlane = input->size(dimf); + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); + int64_t nOutputPlane = weight->size(0); int64_t outputHeight = (inputHeight + 2*padH - kH) / dH + 1; int64_t outputWidth = (inputWidth + 2*padW - kW) / dW + 1; @@ -186,7 +186,7 @@ void THNN_(SpatialConvolutionMM_updateOutput)( } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth); @@ -229,8 +229,8 @@ static void THNN_(SpatialConvolutionMM_updateGradInput_frame)( { THTensor *gradOutput2d = THTensor_(newWithStorage2d) (gradOutput->storage, gradOutput->storageOffset, - gradOutput->size[0], -1, - gradOutput->size[1]*gradOutput->size[2], -1); + gradOutput->size(0), -1, + gradOutput->size(1)*gradOutput->size(2), -1); THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d); THTensor_(free)(gradOutput2d); @@ -238,8 +238,8 @@ static void THNN_(SpatialConvolutionMM_updateGradInput_frame)( THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH, - gradInput->size[0], gradInput->size[2], gradInput->size[1], - gradOutput->size[2], gradOutput->size[1]); + gradInput->size(0), gradInput->size(2), gradInput->size(1), + gradOutput->size(2), gradOutput->size(1)); } void THNN_(SpatialConvolutionMM_updateGradInput)( @@ -283,7 +283,7 @@ void THNN_(SpatialConvolutionMM_updateGradInput)( } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; #pragma omp parallel for private(t) @@ -319,8 +319,8 @@ static void THNN_(SpatialConvolutionMM_accGradParameters_frame)( int64_t i; THTensor *gradOutput2d = THTensor_(newWithStorage2d) (gradOutput->storage, gradOutput->storageOffset, - gradOutput->size[0], -1, - gradOutput->size[1]*gradOutput->size[2], -1); + gradOutput->size(0), -1, + gradOutput->size(1)*gradOutput->size(2), -1); if (gradWeight) { THTensor *tfinput = THTensor_(new)(); @@ -330,12 +330,12 @@ static void THNN_(SpatialConvolutionMM_accGradParameters_frame)( } if (gradBias) { - for(i = 0; i < gradBias->size[0]; i++) + for(i = 0; i < gradBias->size(0); i++) { int64_t k; real sum = 0; - real *data = THStorage_(data)(gradOutput2d->storage) + gradOutput2d->storageOffset + i*gradOutput2d->stride[0]; - for(k = 0; k < gradOutput2d->size[1]; k++) + real *data = THStorage_(data)(gradOutput2d->storage) + gradOutput2d->storageOffset + i*gradOutput2d->stride(0); + for(k = 0; k < gradOutput2d->size(1); k++) sum += data[k]; (THStorage_(data)(gradBias->storage) + gradBias->storageOffset)[i] += scale*sum; } @@ -382,7 +382,7 @@ void 
THNN_(SpatialConvolutionMM_accGradParameters)( } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; for(t = 0; t < T; t++) diff --git a/aten/src/THNN/generic/SpatialConvolutionMap.c b/aten/src/THNN/generic/SpatialConvolutionMap.c index cdd74ed304f9d6..f91c372b4a7ac8 100644 --- a/aten/src/THNN/generic/SpatialConvolutionMap.c +++ b/aten/src/THNN/generic/SpatialConvolutionMap.c @@ -9,7 +9,7 @@ void THNN_(SpatialConvolutionMap_updateOutput)( { THArgCheck( weight != NULL && !weight->is_empty() && weight->dim() == 3 - && connTable != NULL && connTable->size[0] == weight->size[0], 4, + && connTable != NULL && connTable->size(0) == weight->size(0), 4, "non-empty 3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE ); @@ -22,27 +22,27 @@ void THNN_(SpatialConvolutionMap_updateOutput)( if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimc++; dimw++; dimh++; } - const int64_t kH = weight->size[1]; - const int64_t kW = weight->size[2]; + const int64_t kH = weight->size(1); + const int64_t kW = weight->size(2); - THArgCheck(input->size[dimc] >= nInputPlane, 2, "invalid number of input planes"); - THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH, 2, "input image smaller than kernel size"); + THArgCheck(input->size(dimc) >= nInputPlane, 2, "invalid number of input planes"); + THArgCheck(input->size(dimw) >= kW && input->size(dimh) >= kH, 2, "input image smaller than kernel size"); - const int64_t input_w = input->size[dimw]; - const int64_t input_h = input->size[dimh]; + const int64_t input_w = input->size(dimw); + const int64_t input_h = input->size(dimh); const int64_t output_w = (input_w - kW) / dW + 1; const int64_t output_h = (input_h - kH) / dH + 1; if (input->dim() == 3) THTensor_(resize3d)(output, nOutputPlane, output_h, output_w); else - THTensor_(resize4d)(output, input->size[0], nOutputPlane, output_h, output_w); + THTensor_(resize4d)(output, input->size(0), nOutputPlane, output_h, output_w); /* contiguous */ input = THTensor_(newContiguous)(input); @@ -73,7 +73,7 @@ void THNN_(SpatialConvolutionMap_updateOutput)( ptr_output[j] = z; /* convolve all maps */ - int nweight = connTable->size[0]; + int nweight = connTable->size(0); for (k = 0; k < nweight; k++) { /* get offsets for input/output */ @@ -110,7 +110,7 @@ void THNN_(SpatialConvolutionMap_updateGradInput)( { THArgCheck( weight != NULL && !weight->is_empty() && weight->dim() == 3 - && connTable != NULL && connTable->size[0] == weight->size[0], 5, + && connTable != NULL && connTable->size(0) == weight->size(0), 5, "non-empty 3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE ); @@ -120,17 +120,17 @@ void THNN_(SpatialConvolutionMap_updateGradInput)( int64_t nbatch = 1; if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; } - const int64_t input_h = input->size[dimh]; - const int64_t input_w = input->size[dimw]; - const int64_t output_h = gradOutput->size[dimh]; - const int64_t output_w = gradOutput->size[dimw]; - const int64_t kH = weight->size[1]; - const int64_t kW = weight->size[2]; + const int64_t input_h = input->size(dimh); + const int64_t input_w = input->size(dimw); + const int64_t output_h = gradOutput->size(dimh); + const int64_t output_w = gradOutput->size(dimw); + const int64_t kH = weight->size(1); + const int64_t kW = weight->size(2); /* contiguous */ gradInput = THTensor_(newContiguous)(gradInput); @@ -157,7 +157,7 @@ void THNN_(SpatialConvolutionMap_updateGradInput)( { 
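
For orientation, SpatialConvolutionMap applies one kH x kW kernel per row of the connection table: `connTable_data[k*2]` names the input plane feeding kernel k and `connTable_data[k*2+1]` the output plane that accumulates its result, which is why the weight is checked to be connTable:size(0) x kH x kW. A schematic, plain-loop version under illustrative layouts (not the THNN implementation):

```cpp
#include <array>
#include <cstdint>
#include <vector>

struct Plane { int64_t h = 0, w = 0; std::vector<float> v; };  // row-major h x w

// Valid cross-correlation with stride (dH, dW), one kernel per table entry.
// Assumes each output plane is pre-sized to (inH - kH)/dH + 1 by (inW - kW)/dW + 1,
// matching the output resize shown above, and zero-initialized.
void conv_map(std::vector<Plane>& out, const std::vector<Plane>& in,
              const std::vector<std::array<int, 2>>& conn,    // {inPlane, outPlane}
              const std::vector<std::vector<float>>& kernel,  // kernel[k]: kH*kW values
              int64_t kH, int64_t kW, int64_t dH, int64_t dW) {
  for (size_t k = 0; k < conn.size(); ++k) {
    const Plane& src = in[conn[k][0]];
    Plane& dst = out[conn[k][1]];
    for (int64_t y = 0; y < dst.h; ++y)
      for (int64_t x = 0; x < dst.w; ++x) {
        float acc = 0.f;
        for (int64_t i = 0; i < kH; ++i)
          for (int64_t j = 0; j < kW; ++j)
            acc += src.v[(y * dH + i) * src.w + (x * dW + j)] * kernel[k][i * kW + j];
        dst.v[y * dst.w + x] += acc;  // several connections may feed one output plane
      }
  }
}
```
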
int64_t k; /* backward all */ - int nkernel = connTable->size[0]; + int nkernel = connTable->size(0); for (k = 0; k < nkernel; k++) { int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; @@ -197,7 +197,7 @@ void THNN_(SpatialConvolutionMap_accGradParameters)( real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); THArgCheck( gradWeight != NULL && !gradWeight->is_empty() && gradWeight->dim() == 3 - && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5, + && connTable != NULL && connTable->size(0) == gradWeight->size(0), 5, "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE ); @@ -207,17 +207,17 @@ void THNN_(SpatialConvolutionMap_accGradParameters)( int64_t nbatch = 1; if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; } - const int64_t input_h = input->size[dimh]; - const int64_t input_w = input->size[dimw]; - const int64_t output_h = gradOutput->size[dimh]; - const int64_t output_w = gradOutput->size[dimw]; - const int64_t kH = gradWeight->size[1]; - const int64_t kW = gradWeight->size[2]; + const int64_t input_h = input->size(dimh); + const int64_t input_w = input->size(dimw); + const int64_t output_h = gradOutput->size(dimh); + const int64_t output_w = gradOutput->size(dimw); + const int64_t kH = gradWeight->size(1); + const int64_t kW = gradWeight->size(2); /* contiguous */ input = THTensor_(newContiguous)(input); @@ -248,7 +248,7 @@ void THNN_(SpatialConvolutionMap_accGradParameters)( } /* gradients wrt weight */ - const int nkernel = connTable->size[0]; + const int nkernel = connTable->size(0); #pragma omp parallel for private(k) for (k = 0; k < nkernel; k++) { diff --git a/aten/src/THNN/generic/SpatialDilatedConvolution.c b/aten/src/THNN/generic/SpatialDilatedConvolution.c index bddf79be0b785b..10b792a1d61146 100644 --- a/aten/src/THNN/generic/SpatialDilatedConvolution.c +++ b/aten/src/THNN/generic/SpatialDilatedConvolution.c @@ -20,7 +20,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( "non-empty 4D weight tensor (nOutputPlane, nInputPlane, kH, kW) expected, " "but got: %s"); if (bias != NULL) { - THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size(0)); } } else if (!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -40,8 +40,8 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, "non-empty 3D or 4D input tensor expected but got: %s"); - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; @@ -53,16 +53,16 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( } if (weight != NULL) { - int64_t nInputPlane = weight->size[1]; + int64_t nInputPlane = weight->size(1); THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); } if (gradOutput != NULL) { if (weight != NULL) { - int64_t nOutputPlane = weight->size[0]; + int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size[0]; + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); @@ -89,8 
+89,8 @@ void THNN_(SpatialDilatedConvolution_updateOutput)( dilationH, dilationW, 0); // Params: - int nInputPlane = weight->size[1]; - int nOutputPlane = weight->size[0]; + int nInputPlane = weight->size(1); + int nOutputPlane = weight->size(0); input = THTensor_(newContiguous)(input); weight = THTensor_(newContiguous)(weight); @@ -103,15 +103,15 @@ void THNN_(SpatialDilatedConvolution_updateOutput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); @@ -124,7 +124,7 @@ void THNN_(SpatialDilatedConvolution_updateOutput)( // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. if (!THTensor_(isContiguous)(ones) || ones->dim() != 2 || - ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... THTensor_(resize2d)(ones, outputHeight, outputWidth); THTensor_(fill)(ones, 1); @@ -173,7 +173,7 @@ void THNN_(SpatialDilatedConvolution_updateOutput)( // M,N,K are dims of matrix A and B int64_t m = nOutputPlane; - int64_t n = columns->size[1]; + int64_t n = columns->size(1); int64_t k = nInputPlane*kH*kW; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -220,8 +220,8 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)( dilationH, dilationW, 0); // Params - int nInputPlane = weight->size[1]; - int nOutputPlane = weight->size[0]; + int64_t nInputPlane = weight->size(1); + int64_t nOutputPlane = weight->size(0); input = THTensor_(newContiguous)(input); weight = THTensor_(newContiguous)(weight); @@ -231,18 +231,18 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); - THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], - gradOutput->size[2]); + THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2)); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1), + gradOutput->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth); @@ -263,7 +263,7 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)( // M,N,K are dims of matrix A and B int64_t m = nInputPlane*kW*kH; - int64_t n = gradColumns->size[1]; + int64_t n = gradColumns->size(1); int64_t 
k = nOutputPlane; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -338,20 +338,20 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)( if (input->dim() == 3) { // Force batch is_batch = 0; - THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); - THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], - gradOutput->size[1], gradOutput->size[2]); + THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2)); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size(0), + gradOutput->size(1), gradOutput->size(2)); } - int64_t nInputPlane = input->size[1]; - int64_t nOutputPlane = gradOutput->size[1]; - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t nInputPlane = input->size(1); + int64_t nOutputPlane = gradOutput->size(1); + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize temporary columns THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth); @@ -383,7 +383,7 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)( // M,N,K are dims of matrix A and B int64_t m = nOutputPlane; int64_t n = nInputPlane*kW*kH; - int64_t k = columns->size[1]; + int64_t k = columns->size(1); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( @@ -405,7 +405,7 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)( // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) // Define a buffer of ones, for bias accumulation - if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... 
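
The `m`, `n`, `k` assignments in these hunks spell out the im2col formulation: `columns` is resized to (nInputPlane*kW*kH) x (outputHeight*outputWidth), the 4D weight (nOutputPlane, nInputPlane, kH, kW) is viewed as an nOutputPlane x (nInputPlane*kH*kW) matrix, and the forward convolution collapses into one matrix product. A naive sketch of that product with the same dimension names (plain loops standing in for THBlas_(gemm); row-major buffers assumed):

```cpp
#include <cstdint>
#include <vector>

// output[m x n] += weight[m x k] * columns[k x n], all row-major.
// Forward pass: m = nOutputPlane, k = nInputPlane*kH*kW, n = outputHeight*outputWidth.
void gemm_naive(std::vector<float>& output, const std::vector<float>& weight,
                const std::vector<float>& columns, int64_t m, int64_t n, int64_t k) {
  for (int64_t i = 0; i < m; ++i)
    for (int64_t j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int64_t p = 0; p < k; ++p)
        acc += weight[i * k + p] * columns[p * n + j];
      output[i * n + j] += acc;  // accumulate so the bias pass can run beforehand
    }
}
```

The backward pass reuses the same machinery with the roles swapped, which is why updateGradInput sets m = nInputPlane*kW*kH and k = nOutputPlane just above.
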
THTensor_(resize2d)(ones, outputHeight, outputWidth); THTensor_(fill)(ones, 1); diff --git a/aten/src/THNN/generic/SpatialDilatedMaxPooling.c b/aten/src/THNN/generic/SpatialDilatedMaxPooling.c index 2d595b7c449dc5..2b77fcee76d028 100644 --- a/aten/src/THNN/generic/SpatialDilatedMaxPooling.c +++ b/aten/src/THNN/generic/SpatialDilatedMaxPooling.c @@ -34,9 +34,9 @@ static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)( "padW = %d, padH = %d, kW = %d, kH = %d", padW, padH, kW, kH); - int64_t nInputPlane = input->size[dimh-1]; - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t nInputPlane = input->size(dimh-1); + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t outputHeight, outputWidth; int64_t nOutputPlane = nInputPlane; @@ -184,15 +184,15 @@ void THNN_(SpatialDilatedMaxPooling_updateOutput)( if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; } /* sizes */ - nInputPlane = input->size[dimh-1]; - inputHeight = input->size[dimh]; - inputWidth = input->size[dimw]; + nInputPlane = input->size(dimh-1); + inputHeight = input->size(dimh); + inputWidth = input->size(dimw); if (ceil_mode) { outputHeight = (int64_t)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; @@ -349,17 +349,17 @@ void THNN_(SpatialDilatedMaxPooling_updateGradInput)( THTensor_(zero)(gradInput); if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; } /* sizes */ - nInputPlane = input->size[dimh-1]; - inputHeight = input->size[dimh]; - inputWidth = input->size[dimw]; - outputHeight = gradOutput->size[dimh]; - outputWidth = gradOutput->size[dimw]; + nInputPlane = input->size(dimh-1); + inputHeight = input->size(dimh); + inputWidth = input->size(dimw); + outputHeight = gradOutput->size(dimh); + outputWidth = gradOutput->size(dimw); /* get raw pointers */ gradInput_data = THTensor_(data)(gradInput); diff --git a/aten/src/THNN/generic/SpatialFullConvolutionMap.c b/aten/src/THNN/generic/SpatialFullConvolutionMap.c index a6fe50725df6b1..a989ba207c17da 100644 --- a/aten/src/THNN/generic/SpatialFullConvolutionMap.c +++ b/aten/src/THNN/generic/SpatialFullConvolutionMap.c @@ -12,20 +12,20 @@ void THNN_(SpatialFullConvolutionMap_updateOutput)( // What does this mean? 
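
The same output-size arithmetic recurs throughout these files, and the "Full" (transposed) convolutions that follow invert it. A small sketch collecting the formulas as they appear in the hunks (floor division for the forward case; the pooling code computes the ceil_mode variant through a float `ceil`, written here in integer form):

```cpp
#include <cstdint>

// Forward conv/pool: how many kernel placements fit after padding.
// dilation = 1 recovers the plain (in + 2*pad - k) / d + 1 form.
int64_t out_size_forward(int64_t in, int64_t k, int64_t d, int64_t pad,
                         int64_t dilation, bool ceil_mode) {
  const int64_t eff_k = dilation * (k - 1) + 1;  // effective kernel extent
  const int64_t span = in + 2 * pad - eff_k;     // room left for stride steps
  return (ceil_mode ? (span + d - 1) / d : span / d) + 1;
}

// "Full" (transposed) convolution: invert the forward mapping; `adj` is the
// extra output adjustment that resolves the ambiguity left by floor division.
int64_t out_size_full(int64_t in, int64_t k, int64_t d, int64_t pad,
                      int64_t dilation, int64_t adj) {
  return (in - 1) * d - 2 * pad + (dilation * (k - 1) + 1) + adj;
}
```

For example, in = 7, k = 3, d = 2, pad = 0, dilation = 1 gives 3 forward outputs, and out_size_full maps 3 back to 7 with adj = 0; SpatialFullConvolutionMap below is the pad = 0, dilation = 1, adj = 0 special case, (in - 1)*d + k.
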
THArgCheck( weight != NULL && !weight->is_empty() && weight->dim() == 3 - && connTable != NULL && connTable->size[0] == weight->size[0], 4, + && connTable != NULL && connTable->size(0) == weight->size(0), 4, "non-empty 3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE ); - const int kH = (int)weight->size[1]; - const int kW = (int)weight->size[2]; + const int kH = (int)weight->size(1); + const int kW = (int)weight->size(2); THArgCheck(input != NULL && !input->is_empty() && input->dim() == 3, 2, "non-empty 3D tensor expected"); - THArgCheck(input->size[0] >= nInputPlane, 2, "invalid number of input planes"); + THArgCheck(input->size(0) >= nInputPlane, 2, "invalid number of input planes"); THTensor_(resize3d)( output_, nOutputPlane, - (input->size[1] - 1) * dH + kH, - (input->size[2] - 1) * dW + kW + (input->size(1) - 1) * dH + kH, + (input->size(2) - 1) * dW + kW ); /* contiguous */ @@ -40,12 +40,12 @@ void THNN_(SpatialFullConvolutionMap_updateOutput)( real *connTable_data = THTensor_(data)(connTable); /* and dims */ - const int64_t input_h = input->size[1]; - const int64_t input_w = input->size[2]; - const int64_t output_h = output->size[1]; - const int64_t output_w = output->size[2]; - const int64_t weight_h = weight->size[1]; - const int64_t weight_w = weight->size[2]; + const int64_t input_h = input->size(1); + const int64_t input_w = input->size(2); + const int64_t output_h = output->size(1); + const int64_t output_w = output->size(2); + const int64_t weight_h = weight->size(1); + const int64_t weight_w = weight->size(2); int64_t p; #pragma omp parallel for private(p) @@ -61,7 +61,7 @@ void THNN_(SpatialFullConvolutionMap_updateOutput)( ptr_output[j] = bias_data[p]; /* convolve all maps */ - nweight = connTable->size[0]; + nweight = connTable->size(0); for (k = 0; k < nweight; k++) { /* get offsets for input/output */ @@ -93,7 +93,7 @@ void THNN_(SpatialFullConvolutionMap_updateGradInput)( { THArgCheck( weight != NULL && !weight->is_empty() && weight->dim() == 3 - && connTable != NULL && connTable->size[0] == weight->size[0], 5, + && connTable != NULL && connTable->size(0) == weight->size(0), 5, "non-empty 3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE ); @@ -112,12 +112,12 @@ void THNN_(SpatialFullConvolutionMap_updateGradInput)( real *connTable_data = THTensor_(data)(connTable); /* and dims */ - const int64_t input_h = input->size[1]; - const int64_t input_w = input->size[2]; - const int64_t output_h = gradOutput->size[1]; - const int64_t output_w = gradOutput->size[2]; - const int64_t kH = weight->size[1]; - const int64_t kW = weight->size[2]; + const int64_t input_h = input->size(1); + const int64_t input_w = input->size(2); + const int64_t output_h = gradOutput->size(1); + const int64_t output_w = gradOutput->size(2); + const int64_t kH = weight->size(1); + const int64_t kW = weight->size(2); int64_t p; #pragma omp parallel for private(p) @@ -125,7 +125,7 @@ void THNN_(SpatialFullConvolutionMap_updateGradInput)( { int64_t k; /* backward all */ - int nkernel = connTable->size[0]; + int nkernel = connTable->size(0); for (k = 0; k < nkernel; k++) { int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; @@ -164,7 +164,7 @@ void THNN_(SpatialFullConvolutionMap_accGradParameters)( real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); THArgCheck( gradWeight != NULL && !gradWeight->is_empty() && gradWeight->dim() == 3 - && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5, + && connTable != NULL && connTable->size(0) == 
gradWeight->size(0), 5, "non-empty 3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE ); @@ -179,12 +179,12 @@ void THNN_(SpatialFullConvolutionMap_accGradParameters)( real *gradBias_data = THTensor_(data)(gradBias); /* and dims */ - const int64_t input_h = input->size[1]; - const int64_t input_w = input->size[2]; - const int64_t output_h = gradOutput->size[1]; - const int64_t output_w = gradOutput->size[2]; - const int64_t weight_h = gradWeight->size[1]; - const int64_t weight_w = gradWeight->size[2]; + const int64_t input_h = input->size(1); + const int64_t input_w = input->size(2); + const int64_t output_h = gradOutput->size(1); + const int64_t output_w = gradOutput->size(2); + const int64_t weight_h = gradWeight->size(1); + const int64_t weight_w = gradWeight->size(2); /* gradients wrt bias */ int64_t k; @@ -198,7 +198,7 @@ void THNN_(SpatialFullConvolutionMap_accGradParameters)( } /* gradients wrt weight */ - int nkernel = connTable->size[0]; + int nkernel = connTable->size(0); #pragma omp parallel for private(k) for (k = 0; k < nkernel; k++) { diff --git a/aten/src/THNN/generic/SpatialFullDilatedConvolution.c b/aten/src/THNN/generic/SpatialFullDilatedConvolution.c index 35098095d6a7ef..7226db67ef1a74 100644 --- a/aten/src/THNN/generic/SpatialFullDilatedConvolution.c +++ b/aten/src/THNN/generic/SpatialFullDilatedConvolution.c @@ -23,7 +23,7 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( THNN_ARGCHECK(!weight->is_empty() && (weight->dim() == 2 || weight->dim() == 4), 5, weight, "non-empty 2D or 4D weight tensor expected, but got: %s"); if (bias != NULL) { - THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]); + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size(1)); } } else if (!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -43,8 +43,8 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (ndim == 3 || ndim == 4), 2, input, "non-empty 3D or 4D input tensor expected but got: %s"); - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; @@ -55,16 +55,16 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( } if (weight != NULL) { - int64_t nInputPlane = weight->size[0]; + int64_t nInputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); } if (gradOutput != NULL) { if (weight != NULL) { - int64_t nOutputPlane = weight->size[1]; + int64_t nOutputPlane = weight->size(1); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size[0]; + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); @@ -105,16 +105,16 @@ void THNN_(SpatialFullDilatedConvolution_updateOutput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2)); } - int64_t inputHeight = input->size[2]; - int64_t inputWidth = input->size[3]; + int64_t inputHeight = input->size(2); + int64_t inputWidth = input->size(3); int64_t outputHeight = (inputHeight 
- 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); @@ -126,7 +126,7 @@ void THNN_(SpatialFullDilatedConvolution_updateOutput)( // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. - if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... THTensor_(resize2d)(ones, outputHeight, outputWidth); THTensor_(fill)(ones, 1); @@ -145,9 +145,9 @@ void THNN_(SpatialFullDilatedConvolution_updateOutput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = weight->size[1] * weight->size[2] * weight->size[3]; - int64_t n = columns->size[1]; - int64_t k = weight->size[0]; + int64_t m = weight->size(1) * weight->size(2) * weight->size(3); + int64_t n = columns->size(1); + int64_t k = weight->size(0); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( @@ -221,8 +221,8 @@ void THNN_(SpatialFullDilatedConvolution_updateGradInput)( (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, dilationH, dilationW, adjH, adjW, 0); - int nInputPlane = THTensor_(size)(weight,0); - int nOutputPlane = THTensor_(size)(weight,1); + int64_t nInputPlane = THTensor_(size)(weight,0); + int64_t nOutputPlane = THTensor_(size)(weight,1); input = THTensor_(newContiguous)(input); gradOutput = THTensor_(newContiguous)(gradOutput); @@ -233,17 +233,17 @@ void THNN_(SpatialFullDilatedConvolution_updateGradInput)( if (input->dim() == 3) { // Force batch is_batch = 0; - THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); - THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2)); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth); @@ -275,9 +275,9 @@ void THNN_(SpatialFullDilatedConvolution_updateGradInput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t m = weight->size[0]; - int64_t n = gradColumns->size[1]; - int64_t k = weight->size[1] * weight->size[2] * weight->size[3]; + int64_t m = weight->size(0); + int64_t n = gradColumns->size(1); + int64_t k = weight->size(1) * weight->size(2) * weight->size(3); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( @@ -328,7 +328,7 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( 
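
This function keeps the same reusable `ones` buffer as updateOutput: broadcasting a per-plane bias over every output position is the rank-1 update output += bias * onesᵀ, and accumulating gradBias is the product gradOutput * ones (a row sum), so both become ordinary THBlas_(gemm)/THBlas_(gemv) calls, and the buffer is only reallocated when `ones->size(0)*ones->size(1)` falls below outputHeight*outputWidth. A plain-loop sketch of the two uses (illustrative layouts, not the THNN code):

```cpp
#include <cstdint>
#include <vector>

// output, gradOutput: [planes x hw] row-major; bias, gradBias: [planes].
void add_bias(std::vector<float>& output, const std::vector<float>& bias,
              int64_t planes, int64_t hw) {
  const std::vector<float> ones(hw, 1.f);
  for (int64_t i = 0; i < planes; ++i)
    for (int64_t j = 0; j < hw; ++j)
      output[i * hw + j] += bias[i] * ones[j];      // rank-1 update bias * ones^T
}

void acc_grad_bias(std::vector<float>& gradBias, const std::vector<float>& gradOutput,
                   int64_t planes, int64_t hw, float scale) {
  const std::vector<float> ones(hw, 1.f);
  for (int64_t i = 0; i < planes; ++i) {
    float sum = 0.f;
    for (int64_t j = 0; j < hw; ++j)
      sum += gradOutput[i * hw + j] * ones[j];      // row sum as a dot with ones
    gradBias[i] += scale * sum;
  }
}
```
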
(input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, dilationH, dilationW, adjH, adjW, 1); - int nOutputPlane; + int64_t nOutputPlane; if (gradWeight) { nOutputPlane = THTensor_(size)(gradWeight, 1); } else if (gradBias) { @@ -352,20 +352,20 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( if (input->dim() == 3) { // Force batch is_batch = 0; - THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); - THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2)); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); } - int64_t inputWidth = input->size[3]; - int64_t inputHeight = input->size[2]; + int64_t inputWidth = input->size(3); + int64_t inputHeight = input->size(2); int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... THTensor_(resize2d)(ones, outputHeight, outputWidth); THTensor_(fill)(ones, 1); @@ -401,9 +401,9 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - int64_t n = columns->size[0]; // nOutputPlane * kh * kw - int64_t m = input_n->size[0]; // nInputPlane - int64_t k = columns->size[1]; // inputHeight * inputWidth + int64_t n = columns->size(0); // nOutputPlane * kh * kw + int64_t m = input_n->size(0); // nInputPlane + int64_t k = columns->size(1); // inputHeight * inputWidth // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( @@ -444,7 +444,7 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( // Resize if (is_batch == 0) { THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); - THTensor_(resize3d)(input, input->size[1], inputHeight, inputWidth); + THTensor_(resize3d)(input, input->size(1), inputHeight, inputWidth); } THTensor_(free)(input); diff --git a/aten/src/THNN/generic/SpatialMaxUnpooling.c b/aten/src/THNN/generic/SpatialMaxUnpooling.c index 64179b52779a01..cbfb34ed924b57 100644 --- a/aten/src/THNN/generic/SpatialMaxUnpooling.c +++ b/aten/src/THNN/generic/SpatialMaxUnpooling.c @@ -67,15 +67,15 @@ void THNN_(SpatialMaxUnpooling_updateOutput)( if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; } /* sizes */ - nslices = input->size[dimh-1]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimh-1); + iheight = input->size(dimh); + iwidth = input->size(dimw); /* get contiguous input and indices */ input = THTensor_(newContiguous)(input); @@ -184,19 +184,19 @@ void THNN_(SpatialMaxUnpooling_updateGradInput)( THTensor_(zero)(gradInput); if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; } /* sizes */ - nslices = input->size[dimh-1]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimh-1); + iheight = input->size(dimh); + 
iwidth = input->size(dimw); - if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){ + if(owidth!=gradOutput->size(dimw) || oheight!=gradOutput->size(dimh)){ THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %dx%d", - oheight, owidth, gradOutput->size[dimh], gradOutput->size[dimw]); + oheight, owidth, gradOutput->size(dimh), gradOutput->size(dimw)); } /* get raw pointers */ diff --git a/aten/src/THNN/generic/SpatialReflectionPadding.c b/aten/src/THNN/generic/SpatialReflectionPadding.c index 4ccdca8abde381..dec9ffd3f94ae6 100644 --- a/aten/src/THNN/generic/SpatialReflectionPadding.c +++ b/aten/src/THNN/generic/SpatialReflectionPadding.c @@ -72,26 +72,24 @@ void THNN_(SpatialReflectionPadding_updateOutput)(THNNState *state, if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; dimslices++; } /* input sizes */ - nslices = input->size[dimslices]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + iheight = input->size(dimh); + iwidth = input->size(dimw); - THArgCheck(pad_l < iwidth && pad_r < iwidth, 4, - "Padding size should be less than the corresponding input dimension, " - "but got: padding (%d, %d) at dimension %d of input %s", - pad_l, pad_r, dimw, _THSizeDesc(input->size, input->dim()).str); + AT_CHECK(pad_l < iwidth && pad_r < iwidth, + "Argument #4: Padding size should be less than the corresponding input dimension, " + "but got: padding (", pad_l, ", ", pad_r, ") at dimension ", dimw, " of input ", input->sizes()); - THArgCheck(pad_t < iheight && pad_b < iheight, 6, - "Padding size should be less than the corresponding input dimension, " - "but got: padding (%d, %d) at dimension %d of input %s", - pad_t, pad_b, dimh, _THSizeDesc(input->size, input->dim()).str); + AT_CHECK(pad_t < iheight && pad_b < iheight, + "Argument #6: Padding size should be less than the corresponding input dimension, " + "but got: padding (", pad_t, ", ", pad_b, ") at dimension ", dimh, " of input ", input->sizes()); /* output sizes */ oheight = iheight + pad_t + pad_b; @@ -213,16 +211,16 @@ void THNN_(SpatialReflectionPadding_updateGradInput)(THNNState *state, if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; dimslices++; } /* sizes */ - nslices = input->size[dimslices]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + iheight = input->size(dimh); + iwidth = input->size(dimw); oheight = iheight + pad_t + pad_b; owidth = iwidth + pad_l + pad_r; diff --git a/aten/src/THNN/generic/SpatialReplicationPadding.c b/aten/src/THNN/generic/SpatialReplicationPadding.c index 32c125d87c4c49..9275768001e1b6 100644 --- a/aten/src/THNN/generic/SpatialReplicationPadding.c +++ b/aten/src/THNN/generic/SpatialReplicationPadding.c @@ -71,16 +71,16 @@ void THNN_(SpatialReplicationPadding_updateOutput)(THNNState *state, if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; dimslices++; } /* sizes */ - nslices = input->size[dimslices]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + iheight = input->size(dimh); + iwidth = input->size(dimw); oheight = iheight + pad_t + pad_b; owidth = iwidth + pad_l + pad_r; @@ -200,16 +200,16 @@ void THNN_(SpatialReplicationPadding_updateGradInput)(THNNState *state, if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; dimslices++; } /* sizes */ - 
nslices = input->size[dimslices]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + iheight = input->size(dimh); + iwidth = input->size(dimw); oheight = iheight + pad_t + pad_b; owidth = iwidth + pad_l + pad_r; diff --git a/aten/src/THNN/generic/SpatialSubSampling.c b/aten/src/THNN/generic/SpatialSubSampling.c index 8f9f95d48252e8..10303e951c615f 100644 --- a/aten/src/THNN/generic/SpatialSubSampling.c +++ b/aten/src/THNN/generic/SpatialSubSampling.c @@ -24,10 +24,10 @@ static inline void THNN_(SpatialSubSampling_shapeCheck)( dimh++; } - inputWidth = input->size[dimw]; - inputHeight = input->size[dimh]; + inputWidth = input->size(dimw); + inputHeight = input->size(dimh); - THArgCheck(input->size[dimh-1] == nInputPlane, 2, "invalid number of input planes"); + THArgCheck(input->size(dimh-1) == nInputPlane, 2, "invalid number of input planes"); THArgCheck(inputWidth >= kW && inputHeight >= kH, 2, "input image smaller than kernel size"); } @@ -63,20 +63,20 @@ void THNN_(SpatialSubSampling_updateOutput)( THNN_(SpatialSubSampling_shapeCheck)(input, NULL, weight, kW, kH); if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; } - inputWidth = input->size[dimw]; - inputHeight = input->size[dimh]; + inputWidth = input->size(dimw); + inputHeight = input->size(dimh); outputWidth = (inputWidth - kW) / dW + 1; outputHeight = (inputHeight - kH) / dH + 1; if (input->dim() == 3) THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); else - THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth); + THTensor_(resize4d)(output, input->size(0), nInputPlane, outputHeight, outputWidth); input = THTensor_(newContiguous)(input); input_data = THTensor_(data)(input); @@ -152,13 +152,13 @@ void THNN_(SpatialSubSampling_updateGradInput)( int64_t k; if (input->dim() == 4) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; } - inputWidth = input->size[dimw]; - inputHeight = input->size[dimh]; + inputWidth = input->size(dimw); + inputHeight = input->size(dimh); outputWidth = (inputWidth - kW) / dW + 1; outputHeight = (inputHeight - kH) / dH + 1; @@ -239,11 +239,11 @@ void THNN_(SpatialSubSampling_accGradParameters)( if (input->dim() == 4) { dimw++; dimh++; - nbatch = input->size[0]; + nbatch = input->size(0); } - inputWidth = input->size[dimw]; - inputHeight = input->size[dimh]; + inputWidth = input->size(dimw); + inputHeight = input->size(dimh); outputWidth = (inputWidth - kW) / dW + 1; outputHeight = (inputHeight - kH) / dH + 1; diff --git a/aten/src/THNN/generic/THNN.h b/aten/src/THNN/generic/THNN.h index 98338d64da9748..455da04c7e4454 100644 --- a/aten/src/THNN/generic/THNN.h +++ b/aten/src/THNN/generic/THNN.h @@ -147,39 +147,39 @@ TH_API void THNN_(Im2Col_updateOutput)( THNNState *state, THTensor *input, THTensor *output, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW); + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); TH_API void THNN_(Im2Col_updateGradInput)( THNNState *state, THTensor *gradOutput, THTensor *gradInput, - int inputHeight, int inputWidth, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW); + int64_t inputHeight, int64_t inputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); TH_API void THNN_(Col2Im_updateOutput)( THNNState *state, THTensor *input, THTensor *output, - int 
outputHeight, int outputWidth, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW); + int64_t outputHeight, int64_t outputWidth, + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); TH_API void THNN_(Col2Im_updateGradInput)( THNNState *state, THTensor *gradOutput, THTensor *gradInput, - int kH, int kW, - int dH, int dW, - int padH, int padW, - int sH, int sW); + int64_t kH, int64_t kW, + int64_t dH, int64_t dW, + int64_t padH, int64_t padW, + int64_t sH, int64_t sW); TH_API void THNN_(L1Cost_updateOutput)( THNNState *state, // library's state diff --git a/aten/src/THNN/generic/TemporalConvolution.c b/aten/src/THNN/generic/TemporalConvolution.c index a7fdd3f96444eb..2c3e1da84de5ea 100644 --- a/aten/src/THNN/generic/TemporalConvolution.c +++ b/aten/src/THNN/generic/TemporalConvolution.c @@ -25,13 +25,13 @@ static inline void THNN_(TemporalConvolution_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); if (inputFrameSize != NULL) { - THArgCheck(input->size[dimF] == *inputFrameSize, 2, + THArgCheck(input->size(dimF) == *inputFrameSize, 2, "invalid input frame size. Got: %d, Expected: %d", - input->size[dimF], *inputFrameSize); + input->size(dimF), *inputFrameSize); } - THArgCheck(input->size[dimS] >= kW, 2, + THArgCheck(input->size(dimS) >= kW, 2, "input sequence smaller than kernel size. Got: %d, Expected: %d", - input->size[dimS], kW); + input->size(dimS), kW); } void THNN_(TemporalConvolution_updateOutput)( @@ -64,7 +64,7 @@ void THNN_(TemporalConvolution_updateOutput)( outputWindow = THTensor_(new)(); inputWindow = THTensor_(new)(); - nInputFrame = input->size[dimS]; + nInputFrame = input->size(dimS); nOutputFrame = (nInputFrame - kW) / dW + 1; if (input->dim() == 2) @@ -89,14 +89,14 @@ void THNN_(TemporalConvolution_updateOutput)( nOutputFrame -= nFrame; THTensor_(setStorage2d)(inputWindow, input->storage, - input->storageOffset+k*dW*input->size[1], - nFrame, inputFrameStride*input->size[1], - kW*input->size[1], 1); + input->storageOffset+k*dW*input->size(1), + nFrame, inputFrameStride*input->size(1), + kW*input->size(1), 1); THTensor_(setStorage2d)(outputWindow, output->storage, - output->storageOffset + k*output->size[1], - nFrame, outputFrameStride*output->size[1], - output->size[1], 1); + output->storageOffset + k*output->size(1), + nFrame, outputFrameStride*output->size(1), + output->size(1), 1); THTensor *tweight = THTensor_(new)(); THTensor_(transpose)(tweight, weight, 0, 1); @@ -108,7 +108,7 @@ void THNN_(TemporalConvolution_updateOutput)( { THTensor *outputSample = THTensor_(new)(); THTensor *inputSample = THTensor_(new)(); - int nBatchFrame = input->size[0]; + int nBatchFrame = input->size(0); THTensor_(resize3d)(output, nBatchFrame, @@ -137,14 +137,14 @@ void THNN_(TemporalConvolution_updateOutput)( nOutputSampleFrame -= nFrame; THTensor_(setStorage2d)(inputWindow, inputSample->storage, - inputSample->storageOffset+k*dW*inputSample->size[1], - nFrame, inputFrameStride*inputSample->size[1], - kW*inputSample->size[1], 1); + inputSample->storageOffset+k*dW*inputSample->size(1), + nFrame, inputFrameStride*inputSample->size(1), + kW*inputSample->size(1), 1); THTensor_(setStorage2d)(outputWindow, outputSample->storage, - outputSample->storageOffset + k*outputSample->size[1], - nFrame, outputFrameStride*outputSample->size[1], - outputSample->size[1], 1); + outputSample->storageOffset 
+ k*outputSample->size(1), + nFrame, outputFrameStride*outputSample->size(1), + outputSample->size(1), 1); THTensor *tweight = THTensor_(new)(); THTensor_(transpose)(tweight, weight, 0, 1); @@ -188,8 +188,8 @@ void THNN_(TemporalConvolution_updateGradInput)( THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); THNN_(TemporalConvolution_shapeCheck)( state, input, kW, dW, NULL); - nInputFrame = input->size[dimS]; - nOutputFrame = gradOutput->size[dimS]; + nInputFrame = input->size(dimS); + nOutputFrame = gradOutput->size(dimS); input = THTensor_(newContiguous)(input); gradOutput = THTensor_(newContiguous)(gradOutput); @@ -211,14 +211,14 @@ void THNN_(TemporalConvolution_updateGradInput)( nOutputFrame -= nFrame; THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage, - gradOutput->storageOffset + k*gradOutput->size[1], - nFrame, outputFrameStride*gradOutput->size[1], - gradOutput->size[1], 1); + gradOutput->storageOffset + k*gradOutput->size(1), + nFrame, outputFrameStride*gradOutput->size(1), + gradOutput->size(1), 1); THTensor_(setStorage2d)(gradInputWindow, gradInput->storage, - gradInput->storageOffset+k*dW*gradInput->size[1], - nFrame, inputFrameStride*gradInput->size[1], - kW*gradInput->size[1], 1); + gradInput->storageOffset+k*dW*gradInput->size(1), + nFrame, inputFrameStride*gradInput->size(1), + kW*gradInput->size(1), 1); THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight); } @@ -227,7 +227,7 @@ void THNN_(TemporalConvolution_updateGradInput)( { THTensor *gradOutputSample = THTensor_(new)(); THTensor *gradInputSample = THTensor_(new)(); - int nBatchFrame = input->size[0]; + int nBatchFrame = input->size(0); for(i = 0; i < nBatchFrame; i++) { @@ -244,14 +244,14 @@ void THNN_(TemporalConvolution_updateGradInput)( nOutputSampleFrame -= nFrame; THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage, - gradOutputSample->storageOffset + k*gradOutputSample->size[1], - nFrame, outputFrameStride*gradOutputSample->size[1], - gradOutputSample->size[1], 1); + gradOutputSample->storageOffset + k*gradOutputSample->size(1), + nFrame, outputFrameStride*gradOutputSample->size(1), + gradOutputSample->size(1), 1); THTensor_(setStorage2d)(gradInputWindow, gradInputSample->storage, - gradInputSample->storageOffset+k*dW*gradInputSample->size[1], - nFrame, inputFrameStride*gradInputSample->size[1], - kW*gradInputSample->size[1], 1); + gradInputSample->storageOffset+k*dW*gradInputSample->size(1), + nFrame, inputFrameStride*gradInputSample->size(1), + kW*gradInputSample->size(1), 1); THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight); } @@ -294,8 +294,8 @@ void THNN_(TemporalConvolution_accGradParameters)( THNN_(TemporalConvolution_shapeCheck)( state, input, kW, dW, NULL); - nInputFrame = input->size[dimS]; - nOutputFrame = gradOutput->size[dimS]; + nInputFrame = input->size(dimS); + nOutputFrame = gradOutput->size(dimS); input = THTensor_(newContiguous)(input); gradOutput = THTensor_(newContiguous)(gradOutput); @@ -320,14 +320,14 @@ void THNN_(TemporalConvolution_accGradParameters)( nOutputFrame -= nFrame; THTensor_(setStorage2d)(inputWindow, input->storage, - input->storageOffset+k*dW*input->size[1], - nFrame, inputFrameStride*input->size[1], - kW*input->size[1], 1); + input->storageOffset+k*dW*input->size(1), + nFrame, inputFrameStride*input->size(1), + kW*input->size(1), 1); THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage, - gradOutput->storageOffset + k*gradOutput->size[1], - nFrame, 
outputFrameStride*gradOutput->size[1], - gradOutput->size[1], 1); + gradOutput->storageOffset + k*gradOutput->size(1), + nFrame, outputFrameStride*gradOutput->size(1), + gradOutput->size(1), 1); THTensor *tgradOutputWindow = THTensor_(new)(); THTensor_(transpose)(tgradOutputWindow, gradOutputWindow, 0, 1); @@ -339,7 +339,7 @@ void THNN_(TemporalConvolution_accGradParameters)( { THTensor *gradOutputSample = THTensor_(new)(); THTensor *inputSample = THTensor_(new)(); - int nBatchFrame = input->size[0]; + int nBatchFrame = input->size(0); for(i = 0; i < nBatchFrame; i++) { @@ -363,14 +363,14 @@ void THNN_(TemporalConvolution_accGradParameters)( nOutputSampleFrame -= nFrame; THTensor_(setStorage2d)(inputWindow, inputSample->storage, - inputSample->storageOffset+k*dW*inputSample->size[1], - nFrame, inputFrameStride*inputSample->size[1], - kW*inputSample->size[1], 1); + inputSample->storageOffset+k*dW*inputSample->size(1), + nFrame, inputFrameStride*inputSample->size(1), + kW*inputSample->size(1), 1); THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage, - gradOutputSample->storageOffset + k*gradOutputSample->size[1], - nFrame, outputFrameStride*gradOutputSample->size[1], - gradOutputSample->size[1], 1); + gradOutputSample->storageOffset + k*gradOutputSample->size(1), + nFrame, outputFrameStride*gradOutputSample->size(1), + gradOutputSample->size(1), 1); THTensor *tgradOutputWindow = THTensor_(new)(); THTensor_(transpose)(tgradOutputWindow, gradOutputWindow, 0, 1); diff --git a/aten/src/THNN/generic/TemporalMaxPooling.c b/aten/src/THNN/generic/TemporalMaxPooling.c index faef3059d09619..69f1c94eced211 100644 --- a/aten/src/THNN/generic/TemporalMaxPooling.c +++ b/aten/src/THNN/generic/TemporalMaxPooling.c @@ -23,8 +23,8 @@ static inline void THNN_(TemporalMaxPooling_shapeCheck)( dimF = 2; } - niframe = input->size[dimS]; - framesize = input->size[dimF]; + niframe = input->size(dimS); + framesize = input->size(dimF); noframe = (niframe - kW) / dW + 1; THArgCheck(kW > 0, 5, @@ -34,9 +34,9 @@ static inline void THNN_(TemporalMaxPooling_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (input->dim() == 2 || input->dim() == 3), 2, input, "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); - THArgCheck(input->size[dimS] >= kW, 2, + THArgCheck(input->size(dimS) >= kW, 2, "input sequence smaller than kernel size. 
Got: %d, Expected: %d", - input->size[dimS], kW); + input->size(dimS), kW); if (gradOutput != NULL) { THNN_CHECK_DIM_SIZE(gradOutput, ndims, dimS, noframe); @@ -78,8 +78,8 @@ void THNN_(TemporalMaxPooling_updateOutput)( } /* sizes */ - niframe = input->size[dimS]; - framesize = input->size[dimF]; + niframe = input->size(dimS); + framesize = input->size(dimF); noframe = (niframe - kW) / dW + 1; /* get contiguous input */ @@ -129,7 +129,7 @@ void THNN_(TemporalMaxPooling_updateOutput)( else { /* number of batch frames */ - int64_t nbframe = input->size[0]; + int64_t nbframe = input->size(0); int64_t i; /* resize output */ @@ -221,9 +221,9 @@ void THNN_(TemporalMaxPooling_updateGradInput)( dimF = 2; } /* sizes */ - niframe = input->size[dimS]; - noframe = gradOutput->size[dimS]; - framesize = gradOutput->size[dimF]; + niframe = input->size(dimS); + noframe = gradOutput->size(dimS); + framesize = gradOutput->size(dimF); /* get raw pointers */ gradInput_data = THTensor_(data)(gradInput); @@ -250,7 +250,7 @@ void THNN_(TemporalMaxPooling_updateGradInput)( else { /* number of batch frames */ - int64_t nbframe = input->size[0]; + int64_t nbframe = input->size(0); int64_t i; for(i = 0; i < nbframe; i++) diff --git a/aten/src/THNN/generic/TemporalReflectionPadding.c b/aten/src/THNN/generic/TemporalReflectionPadding.c index ea6ea9ab60ebb5..43eb604a78972c 100644 --- a/aten/src/THNN/generic/TemporalReflectionPadding.c +++ b/aten/src/THNN/generic/TemporalReflectionPadding.c @@ -55,19 +55,18 @@ void THNN_(TemporalReflectionPadding_updateOutput)(THNNState *state, if (input->dim() == 3) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimslices++; } /* input size */ - nslices = input->size[dimslices]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + iwidth = input->size(dimw); - THArgCheck(pad_l < iwidth && pad_r < iwidth, 4, - "Padding size should be less than the corresponding input dimension, " - "but got: padding (%d, %d) at dimension %d of input %s", - pad_l, pad_r, dimw, _THSizeDesc(input->size, input->dim()).str); + AT_CHECK(pad_l < iwidth && pad_r < iwidth, + "Argument #4: Padding size should be less than the corresponding input dimension, " + "but got: padding (", pad_l, ", ", pad_r, ") at dimension ", dimw, " of input ", input->sizes()); /* output size */ owidth = iwidth + pad_l + pad_r; @@ -168,14 +167,14 @@ void THNN_(TemporalReflectionPadding_updateGradInput)(THNNState *state, if (input->dim() == 3) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimslices++; } /* sizes */ - nslices = input->size[dimslices]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + iwidth = input->size(dimw); owidth = iwidth + pad_l + pad_r; THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, diff --git a/aten/src/THNN/generic/TemporalReplicationPadding.c b/aten/src/THNN/generic/TemporalReplicationPadding.c index da8aeb5d8e52a0..e47a94144530c3 100644 --- a/aten/src/THNN/generic/TemporalReplicationPadding.c +++ b/aten/src/THNN/generic/TemporalReplicationPadding.c @@ -53,14 +53,14 @@ void THNN_(TemporalReplicationPadding_updateOutput)(THNNState *state, if (input->dim() == 3) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimslices++; } /* sizes */ - nslices = input->size[dimslices]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + iwidth = input->size(dimw); owidth = iwidth + pad_l + pad_r; THArgCheck(owidth >= 1 , 2, @@ -159,14 +159,14 @@ void THNN_(TemporalReplicationPadding_updateGradInput)(THNNState *state, 
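
A second pattern in the reflection-padding files above is the move from printf-style THArgCheck messages to AT_CHECK, whose message is assembled from a comma-separated list of heterogeneous pieces (string literals, the pad values, `input->sizes()`). A minimal stand-in that shows only the calling convention — not the real AT_CHECK implementation — assuming C++17 for the fold expression:

```cpp
#include <sstream>
#include <stdexcept>

// Streams every extra argument into one message when the condition fails.
template <typename... Args>
void check(bool cond, Args&&... pieces) {
  if (cond) return;
  std::ostringstream msg;
  (msg << ... << pieces);  // C++17 left fold over operator<<
  throw std::runtime_error(msg.str());
}

// Usage mirroring the hunk above (pad_l, pad_r, iwidth, dimw assumed in scope):
//   check(pad_l < iwidth && pad_r < iwidth,
//         "Padding size should be less than the corresponding input dimension, "
//         "but got: padding (", pad_l, ", ", pad_r, ") at dimension ", dimw);
```
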
if (input->dim() == 3) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimslices++; } /* sizes */ - nslices = input->size[dimslices]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + iwidth = input->size(dimw); owidth = iwidth + pad_l + pad_r; THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, diff --git a/aten/src/THNN/generic/TemporalRowConvolution.c b/aten/src/THNN/generic/TemporalRowConvolution.c index db3278b3c2e1b3..5a85065058670c 100644 --- a/aten/src/THNN/generic/TemporalRowConvolution.c +++ b/aten/src/THNN/generic/TemporalRowConvolution.c @@ -22,7 +22,7 @@ static inline void THNN_(TemporalRowConvolution_shapeCheck)( THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous"); if (bias != NULL) { - THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size(0)); } // we're always looking at (possibly batch) x feats x seq @@ -38,8 +38,8 @@ static inline void THNN_(TemporalRowConvolution_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (ndim == 2 || ndim == 3), 1, input, "non-empty 2D or 3D (batch mode) input tensor expected, but got :%s"); - int64_t inputFrameSize = weight->size[0]; - int64_t nInputFrame = input->size[dimS]; + int64_t inputFrameSize = weight->size(0); + int64_t nInputFrame = input->size(dimS); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; if (nOutputFrame < 1) { @@ -162,7 +162,7 @@ static void THNN_(TemporalRowConvolution_updateOutput_frame)( for (i = 0; i < inputFrameSize; i++) THVector_(fill) (THStorage_(data)(output->storage) + output->storageOffset - + output->stride[0] * i, + + output->stride(0) * i, THTensor_(get1d)(bias, i), nOutputFrame); } @@ -186,7 +186,7 @@ void THNN_(TemporalRowConvolution_updateOutput)( int ndim = input->dim(); - THTensor *tinput; + THTensor *tinput = NULL; if (!featFirst) { tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2); input = THTensor_(newContiguous)(tinput); @@ -197,8 +197,8 @@ void THNN_(TemporalRowConvolution_updateOutput)( THNN_(TemporalRowConvolution_shapeCheck)( state, input, NULL, weight, bias, kW, dW, padW); - int64_t inputFrameSize = weight->size[0]; - int64_t nInputFrame = input->size[ndim - 1]; + int64_t inputFrameSize = weight->size(0); + int64_t nInputFrame = input->size(ndim - 1); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; if (ndim == 2) { /* non-batch mode */ @@ -215,7 +215,7 @@ void THNN_(TemporalRowConvolution_updateOutput)( inputFrameSize, nInputFrame, nOutputFrame); } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; THTensor_(resize4d)(finput, T, inputFrameSize, kW, nOutputFrame); @@ -311,8 +311,8 @@ void THNN_(TemporalRowConvolution_updateGradInput)( THNN_(TemporalRowConvolution_shapeCheck)(state, input, gradOutput, weight, NULL, kW, dW, padW); - int64_t inputFrameSize = weight->size[0]; - int64_t nInputFrame = input->size[ndim - 1]; + int64_t inputFrameSize = weight->size(0); + int64_t nInputFrame = input->size(ndim - 1); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; THTensor_(resizeAs)(fgradInput, finput); @@ -330,7 +330,7 @@ void THNN_(TemporalRowConvolution_updateGradInput)( kW, dW, padW, inputFrameSize, nInputFrame, nOutputFrame); } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; #pragma omp parallel for private(t) @@ -373,9 +373,9 @@ static void THNN_(TemporalRowConvolution_accGradParameters_frame)( int64_t i; THTensor *gradOutput3d = THTensor_(newWithStorage3d)( gradOutput->storage, 
gradOutput->storageOffset, - gradOutput->size[0], -1, + gradOutput->size(0), -1, 1, -1, - gradOutput->size[1], -1); + gradOutput->size(1), -1); THTensor *tfinput = THTensor_(new)(); THTensor_(transpose)(tfinput, finput, 1, 2); @@ -386,13 +386,13 @@ static void THNN_(TemporalRowConvolution_accGradParameters_frame)( THTensor_(free)(tfinput); if (gradBias != NULL) { - for (i = 0; i < gradBias->size[0]; i++) { + for (i = 0; i < gradBias->size(0); i++) { int64_t k; real sum = 0; real *data = THStorage_(data)(gradOutput3d->storage) + gradOutput3d->storageOffset - + i * gradOutput3d->stride[0]; - for (k = 0; k < gradOutput3d->size[2]; k++) { + + i * gradOutput3d->stride(0); + for (k = 0; k < gradOutput3d->size(2); k++) { sum += data[k]; } (THStorage_(data)(gradBias->storage) + gradBias->storageOffset)[i] @@ -441,7 +441,7 @@ void THNN_(TemporalRowConvolution_accGradParameters)( THNN_(TemporalRowConvolution_accGradParameters_frame)( gradOutput, gradWeight, gradBias, finput, scale); } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; for (t = 0; t < T; t++) { diff --git a/aten/src/THNN/generic/TemporalSubSampling.c b/aten/src/THNN/generic/TemporalSubSampling.c index 8c90d26a2cc403..5467827fe0ca81 100644 --- a/aten/src/THNN/generic/TemporalSubSampling.c +++ b/aten/src/THNN/generic/TemporalSubSampling.c @@ -19,15 +19,15 @@ static inline void THNN_(TemporalSubSampling_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && input->dim() == 2, 2, input, "non-empty 2D or 3D (batch mode) tensor expected for input, but got: %s"); if (inputFrameSize != NULL) { - THArgCheck( input->size[1] == *inputFrameSize, 2, + THArgCheck( input->size(1) == *inputFrameSize, 2, "invalid input frame size. Got: %d, Expected: %d", - input->size[1], *inputFrameSize); + input->size(1), *inputFrameSize); } - THArgCheck( input->size[0] >= kW, 2, + THArgCheck( input->size(0) >= kW, 2, "input sequence smaller than kernel size. 
Got %d, Expected: %d", - input->size[0], kW); + input->size(0), kW); - nInputFrame = input->size[0]; + nInputFrame = input->size(0); nOutputFrame = (nInputFrame - kW) / dW + 1; if (gradOutput != NULL) { @@ -59,7 +59,7 @@ void THNN_(TemporalSubSampling_updateOutput)( outputFrame = THTensor_(new)(); inputWindow = THTensor_(new)(); - nInputFrame = input->size[0]; + nInputFrame = input->size(0); nOutputFrame = (nInputFrame - kW) / dW + 1; THTensor_(resize2d)(output, @@ -105,7 +105,7 @@ void THNN_(TemporalSubSampling_updateGradInput)( THTensor_(resizeAs)(gradInput, input); THTensor_(zero)(gradInput); - for(k = 0; k < gradOutput->size[0]; k++) + for(k = 0; k < gradOutput->size(0); k++) { THTensor_(narrow)(gradInputWindow, gradInput, 0, k*dW, kW); THTensor_(select)(gradOutputFrame, gradOutput, 0, k); @@ -139,7 +139,7 @@ void THNN_(TemporalSubSampling_accGradParameters)( inputWindow = THTensor_(new)(); buffer = THTensor_(new)(); - for(k = 0; k < gradOutput->size[0]; k++) + for(k = 0; k < gradOutput->size(0); k++) { THTensor_(narrow)(inputWindow, input, 0, k*dW, kW); THTensor_(select)(gradOutputFrame, gradOutput, 0, k); diff --git a/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c b/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c index 1edf8a99741df4..5956312ab24393 100644 --- a/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c +++ b/aten/src/THNN/generic/VolumetricAdaptiveAveragePooling.c @@ -109,8 +109,8 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( if (input->dim() == 5) { - istrideB = input->stride[0]; - sizeB = input->size[0]; + istrideB = input->stride(0); + sizeB = input->size(0); dimD++; dimT++; dimH++; @@ -118,15 +118,15 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)( } /* sizes */ - sizeD = input->size[dimD]; - isizeT = input->size[dimT]; - isizeH = input->size[dimH]; - isizeW = input->size[dimW]; + sizeD = input->size(dimD); + isizeT = input->size(dimT); + isizeH = input->size(dimH); + isizeW = input->size(dimW); /* strides */ - istrideD = input->stride[dimD]; - istrideT = input->stride[dimT]; - istrideH = input->stride[dimH]; - istrideW = input->stride[dimW]; + istrideD = input->stride(dimD); + istrideT = input->stride(dimT); + istrideH = input->stride(dimH); + istrideW = input->stride(dimW); /* resize output */ if (input->dim() == 4) @@ -253,7 +253,7 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( THTensor_(zero)(gradInput); if (input->dim() == 5) { - sizeB = input->size[0]; + sizeB = input->size(0); dimD++; dimT++; dimH++; @@ -261,13 +261,13 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateGradInput)( } /* sizes */ - sizeD = input->size[dimD]; - isizeT = input->size[dimT]; - isizeH = input->size[dimH]; - isizeW = input->size[dimW]; - osizeT = gradOutput->size[dimT]; - osizeH = gradOutput->size[dimH]; - osizeW = gradOutput->size[dimW]; + sizeD = input->size(dimD); + isizeT = input->size(dimT); + isizeH = input->size(dimH); + isizeW = input->size(dimW); + osizeT = gradOutput->size(dimT); + osizeH = gradOutput->size(dimH); + osizeW = gradOutput->size(dimW); /* get raw pointers */ gradInput_data = THTensor_(data)(gradInput); diff --git a/aten/src/THNN/generic/VolumetricAdaptiveMaxPooling.c b/aten/src/THNN/generic/VolumetricAdaptiveMaxPooling.c index 74efa76ebd42d1..00d5a763d38ae0 100644 --- a/aten/src/THNN/generic/VolumetricAdaptiveMaxPooling.c +++ b/aten/src/THNN/generic/VolumetricAdaptiveMaxPooling.c @@ -120,8 +120,8 @@ void THNN_(VolumetricAdaptiveMaxPooling_updateOutput)( if (input->dim() == 5) { - istrideB = 
input->stride[0]; - sizeB = input->size[0]; + istrideB = input->stride(0); + sizeB = input->size(0); dimD++; dimT++; dimH++; @@ -129,15 +129,15 @@ void THNN_(VolumetricAdaptiveMaxPooling_updateOutput)( } /* sizes */ - sizeD = input->size[dimD]; - isizeT = input->size[dimT]; - isizeH = input->size[dimH]; - isizeW = input->size[dimW]; + sizeD = input->size(dimD); + isizeT = input->size(dimT); + isizeH = input->size(dimH); + isizeW = input->size(dimW); /* strides */ - istrideD = input->stride[dimD]; - istrideT = input->stride[dimT]; - istrideH = input->stride[dimH]; - istrideW = input->stride[dimW]; + istrideD = input->stride(dimD); + istrideT = input->stride(dimT); + istrideH = input->stride(dimH); + istrideW = input->stride(dimW); /* resize output */ if (input->dim() == 4) @@ -254,7 +254,7 @@ void THNN_(VolumetricAdaptiveMaxPooling_updateGradInput)( THTensor_(zero)(gradInput); if (input->dim() == 5) { - sizeB = input->size[0]; + sizeB = input->size(0); dimD++; dimT++; dimH++; @@ -262,13 +262,13 @@ void THNN_(VolumetricAdaptiveMaxPooling_updateGradInput)( } /* sizes */ - sizeD = input->size[dimD]; - isizeT = input->size[dimT]; - isizeH = input->size[dimH]; - isizeW = input->size[dimW]; - osizeT = gradOutput->size[dimT]; - osizeH = gradOutput->size[dimH]; - osizeW = gradOutput->size[dimW]; + sizeD = input->size(dimD); + isizeT = input->size(dimT); + isizeH = input->size(dimH); + isizeW = input->size(dimW); + osizeT = gradOutput->size(dimT); + osizeH = gradOutput->size(dimH); + osizeW = gradOutput->size(dimW); /* get raw pointers */ gradInput_data = THTensor_(data)(gradInput); diff --git a/aten/src/THNN/generic/VolumetricAveragePooling.c b/aten/src/THNN/generic/VolumetricAveragePooling.c index c9dd9f753dcc53..93448ad2e62534 100644 --- a/aten/src/THNN/generic/VolumetricAveragePooling.c +++ b/aten/src/THNN/generic/VolumetricAveragePooling.c @@ -47,11 +47,11 @@ static inline void THNN_(VolumetricAveragePooling_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input, "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s"); - THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH - && input->size[dimt] >= kT, 2, + THArgCheck(input->size(dimw) >= kW && input->size(dimh) >= kH + && input->size(dimt) >= kT, 2, "input image (T: %d H: %d W: %d) smaller than " "kernel size (kT: %d kH: %d kW: %d)", - input->size[dimt], input->size[dimh], input->size[dimw], + input->size(dimt), input->size(dimh), input->size(dimw), kT, kH, kW); // The second argument is argNumber... here is the index of padH. 
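Most of the THNN hunks in this patch are the same mechanical change: raw member indexing into the tensor's size and stride arrays (input->size[d], input->stride[d]) becomes an accessor call (input->size(d), input->stride(d)). A minimal sketch of the pattern follows; TensorSketch is invented for illustration and is not the real THTensor, which presumably moves sizes and strides behind accessors so the backing representation can change without editing every kernel again.

    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    // Invented stand-in, not the real THTensor: it only demonstrates the
    // call-site change from size[d]/stride[d] to size(d)/stride(d).
    struct TensorSketch {
      std::vector<int64_t> sizes_;
      std::vector<int64_t> strides_;

      int64_t size(int64_t d) const {      // new-style accessor
        if (d < 0 || d >= (int64_t)sizes_.size())
          throw std::out_of_range("dimension out of range");
        return sizes_[d];
      }
      int64_t stride(int64_t d) const {    // new-style accessor
        if (d < 0 || d >= (int64_t)strides_.size())
          throw std::out_of_range("dimension out of range");
        return strides_[d];
      }
    };

    // A call site written against the accessors no longer cares how the
    // sizes are stored; pre-patch code would have read t.size[d] directly.
    int64_t numelSketch(const TensorSketch& t) {
      int64_t n = 1;
      for (int64_t d = 0; d < (int64_t)t.sizes_.size(); ++d)
        n *= t.size(d);
      return n;
    }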
@@ -61,10 +61,10 @@ static inline void THNN_(VolumetricAveragePooling_shapeCheck)( padT, padW, padH, kT, kW, kH); /* sizes */ - nslices = input->size[dimN]; - itime = input->size[dimt]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimN); + itime = input->size(dimt); + iheight = input->size(dimh); + iwidth = input->size(dimw); if (ceil_mode) { otime = (int64_t)(ceil((float)(itime - kT + 2*padT) / dT)) + 1; @@ -231,10 +231,10 @@ void THNN_(VolumetricAveragePooling_updateOutput)( } /* sizes */ - nslices = input->size[dimN]; - itime = input->size[dimt]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimN); + itime = input->size(dimt); + iheight = input->size(dimh); + iwidth = input->size(dimw); if (ceil_mode) { otime = (int64_t)(ceil((float)(itime - kT + 2*padT) / dT)) + 1; @@ -283,7 +283,7 @@ void THNN_(VolumetricAveragePooling_updateOutput)( else /* batch mode */ { int64_t p; - int64_t nBatch = input->size[0]; + int64_t nBatch = input->size(0); int64_t istride = nslices * itime * iwidth * iheight; int64_t ostride = nslices * otime * owidth * oheight; @@ -445,13 +445,13 @@ void THNN_(VolumetricAveragePooling_updateGradInput)( } /* sizes */ - nslices = input->size[dimN]; - itime = input->size[dimt]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; - otime = gradOutput->size[dimt]; - oheight = gradOutput->size[dimh]; - owidth = gradOutput->size[dimw]; + nslices = input->size(dimN); + itime = input->size(dimt); + iheight = input->size(dimh); + iwidth = input->size(dimw); + otime = gradOutput->size(dimt); + oheight = gradOutput->size(dimh); + owidth = gradOutput->size(dimw); /* get raw pointers */ gradInput_data = THTensor_(data)(gradInput); @@ -473,7 +473,7 @@ void THNN_(VolumetricAveragePooling_updateGradInput)( else /* batch mode */ { int64_t p; - int64_t nBatch = input->size[0]; + int64_t nBatch = input->size(0); int64_t istride = nslices * itime * iwidth * iheight; int64_t ostride = nslices * otime * owidth * oheight; diff --git a/aten/src/THNN/generic/VolumetricConvolution.c b/aten/src/THNN/generic/VolumetricConvolution.c index d88cc606eea184..4b74445e047705 100644 --- a/aten/src/THNN/generic/VolumetricConvolution.c +++ b/aten/src/THNN/generic/VolumetricConvolution.c @@ -33,13 +33,13 @@ void THNN_(VolumetricConvolution_updateOutput)( dimw++; } - int64_t nOutputPlane = weight->size[0]; - int64_t kT = weight->size[2]; - int64_t kH = weight->size[3]; - int64_t kW = weight->size[4]; - int64_t inputDepth = input->size[dimt]; - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t nOutputPlane = weight->size(0); + int64_t kT = weight->size(2); + int64_t kH = weight->size(3); + int64_t kW = weight->size(4); + int64_t inputDepth = input->size(dimt); + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t outputDepth = (inputDepth - kT) / dT + 1; int64_t outputWidth = (inputWidth - kW) / dW + 1; int64_t outputHeight = (inputHeight - kH) / dH + 1; @@ -51,7 +51,7 @@ void THNN_(VolumetricConvolution_updateOutput)( /* add bias */ if (bias) { - for (i = 0; i < bias->size[0]; i++) + for (i = 0; i < bias->size(0); i++) { THTensor_(select)(outn, output, 0, i); THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); @@ -65,7 +65,7 @@ void THNN_(VolumetricConvolution_updateOutput)( } else /* batch mode */ { - int64_t nBatch = input->size[0]; + int64_t nBatch = input->size(0); THTensor_(resize5d)(output, nBatch, nOutputPlane, outputDepth, outputHeight, 
outputWidth); THTensor *inb = THTensor_(new)(); THTensor *outb = THTensor_(new)(); @@ -78,7 +78,7 @@ void THNN_(VolumetricConvolution_updateOutput)( /* add bias */ if (bias) { - for (i = 0; i < bias->size[0]; i++) + for (i = 0; i < bias->size(0); i++) { THTensor_(select)(outn, outb, 0, i); THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); @@ -117,7 +117,7 @@ void THNN_(VolumetricConvolution_updateGradInput)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for weight, but got: %s"); - int nOutputPlane = (int)weight->size[0]; + int nOutputPlane = (int)weight->size(0); THNN_ARGCHECK(!gradOutput->is_empty() && (gradOutput->dim() == 4 || gradOutput->dim() == 5), 3, gradOutput, @@ -129,7 +129,7 @@ void THNN_(VolumetricConvolution_updateGradInput)( dimPlane++; } - THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1, + THArgCheck(nOutputPlane == gradOutput->size(dimPlane), 1, "Number of output features is not equal to nOutputPlane" ); @@ -141,13 +141,13 @@ void THNN_(VolumetricConvolution_updateGradInput)( } else /* batch mode */ { - int64_t nBatch = gradOutput->size[0]; + int64_t nBatch = gradOutput->size(0); THTensor *ginpb = THTensor_(new)(); THTensor *goutb = THTensor_(new)(); int64_t j; THTensor_(resize5d)(gradInput, - input->size[0], input->size[1], input->size[2], input->size[3], input->size[4] + input->size(0), input->size(1), input->size(2), input->size(3), input->size(4) ); /* loop over batches */ @@ -187,9 +187,9 @@ void THNN_(VolumetricConvolution_accGradParameters)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for gradWeight, but got: %s"); - int nOutputPlane = (int)gradWeight->size[0]; + int nOutputPlane = (int)gradWeight->size(0); if (gradBias) { - THArgCheck(!gradBias->is_empty() && gradBias->dim() == 1 && gradBias->size[0] == nOutputPlane, 5, + THArgCheck(!gradBias->is_empty() && gradBias->dim() == 1 && gradBias->size(0) == nOutputPlane, 5, "gradBias tensor has wrong size" ); } @@ -203,7 +203,7 @@ void THNN_(VolumetricConvolution_accGradParameters)( dimPlane++; } - THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1, + THArgCheck(nOutputPlane == gradOutput->size(dimPlane), 1, "Number of output features is not equal to nOutputPlane" ); @@ -226,7 +226,7 @@ void THNN_(VolumetricConvolution_accGradParameters)( } else /* batch mode */ { - int64_t nBatch = gradOutput->size[0]; + int64_t nBatch = gradOutput->size(0); THTensor *inpb = THTensor_(new)(); THTensor *goutb = THTensor_(new)(); int64_t j; diff --git a/aten/src/THNN/generic/VolumetricConvolutionMM.c b/aten/src/THNN/generic/VolumetricConvolutionMM.c index 2fa1874c55941c..525fa5928edc68 100644 --- a/aten/src/THNN/generic/VolumetricConvolutionMM.c +++ b/aten/src/THNN/generic/VolumetricConvolutionMM.c @@ -31,7 +31,7 @@ static void inline THNN_(VolumetricConvolutionMM_shapeCheck)( THNN_ARGCHECK(!weight->is_empty() && (weight->dim() == 2 || weight->dim() == 5), 5, weight, "non-empty 2D or 5D weight tensor expected, but got: %s"); if (bias != NULL) { - THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size(0)); } } else if (!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -62,9 +62,9 @@ static void inline THNN_(VolumetricConvolutionMM_shapeCheck)( int64_t outputHeight; int64_t outputWidth; - inputDepth = input->size[dimt]; - inputHeight = input->size[dimh]; - inputWidth = input->size[dimw]; + inputDepth = input->size(dimt); + inputHeight = input->size(dimh); + inputWidth = input->size(dimw); 
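The output-extent arithmetic shared by the convolution and pooling kernels in these files is easy to sanity-check with concrete numbers. The sketch below uses made-up sizes; floor division is what the convolution paths use, and ceil_mode (as in VolumetricAveragePooling) switches pooling to ceil.

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Illustrative numbers only. floor matches the convolution paths;
    // ceil_mode switches pooling (e.g. VolumetricAveragePooling) to ceil.
    int64_t out_extent(int64_t in, int64_t k, int64_t d, int64_t pad, bool ceil_mode) {
      double span = (double)(in + 2 * pad - k);
      double steps = ceil_mode ? std::ceil(span / (double)d) : std::floor(span / (double)d);
      return (int64_t)steps + 1;
    }

    int main() {
      // itime = 16, kT = 3, dT = 2, padT = 1
      std::printf("%lld\n", (long long)out_extent(16, 3, 2, 1, false)); // prints 8
      std::printf("%lld\n", (long long)out_extent(16, 3, 2, 1, true));  // prints 9
      return 0;
    }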
exactInputDepth = inputDepth + 2*pT; exactInputHeight = inputHeight + 2*pH; @@ -88,7 +88,7 @@ static void inline THNN_(VolumetricConvolutionMM_shapeCheck)( } if (weight != NULL) { - int64_t nInputPlane = weight->size[1]; + int64_t nInputPlane = weight->size(1); if (weight->dim() == 2) { nInputPlane /= (kT * kH * kW); } @@ -97,10 +97,10 @@ static void inline THNN_(VolumetricConvolutionMM_shapeCheck)( if (gradOutput != NULL) { if (weight != NULL) { - int64_t nOutputPlane = weight->size[0]; + int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size[0]; + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, outputDepth); @@ -113,8 +113,8 @@ static THTensor* THNN_(newViewWeight)(THTensor *weight) { weight = THTensor_(newContiguous)(weight); if (weight->dim() == 5) { - int64_t s1 = weight->size[0]; - int64_t s2 = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + int64_t s1 = weight->size(0); + int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3) * weight->size(4); THTensor *old_weight = weight; weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, s1, -1, s2, -1); @@ -435,7 +435,7 @@ static void THNN_(VolumetricConvolutionMM_updateOutput_frame)( for (i = 0; i < nOutputPlane; i++) { THVector_(fill)( - THStorage_(data)(output->storage)+output->storageOffset+output->stride[0]*i, + THStorage_(data)(output->storage)+output->storageOffset+output->stride(0)*i, THTensor_(get1d)(bias, i), outputDepth*outputHeight*outputWidth ); @@ -494,11 +494,11 @@ void THNN_(VolumetricConvolutionMM_updateOutput)( dimw++; } - nInputPlane = input->size[dimf]; - inputDepth = input->size[dimt]; - inputHeight = input->size[dimh]; - inputWidth = input->size[dimw]; - nOutputPlane = weight->size[0]; + nInputPlane = input->size(dimf); + inputDepth = input->size(dimt); + inputHeight = input->size(dimh); + inputWidth = input->size(dimw); + nOutputPlane = weight->size(0); outputDepth = (inputDepth + 2*pT - kT) / dT + 1; outputHeight = (inputHeight + 2*pH - kH) / dH + 1; outputWidth = (inputWidth + 2*pW - kW) / dW + 1; @@ -521,7 +521,7 @@ void THNN_(VolumetricConvolutionMM_updateOutput)( } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; THTensor_(resize3d)(finput, T, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth); @@ -571,8 +571,8 @@ static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)( { THTensor *gradOutput2d = THTensor_(newWithStorage2d)( gradOutput->storage, gradOutput->storageOffset, - gradOutput->size[0], -1, - gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1 + gradOutput->size(0), -1, + gradOutput->size(1)*gradOutput->size(2)*gradOutput->size(3), -1 ); THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d); @@ -585,8 +585,8 @@ static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)( kT, kW, kH, dT, dW, dH, pT, pW, pH, - gradInput->size[0], gradInput->size[1], gradInput->size[3], gradInput->size[2], - gradOutput->size[1], gradOutput->size[3], gradOutput->size[2] + gradInput->size(0), gradInput->size(1), gradInput->size(3), gradInput->size(2), + gradOutput->size(1), gradOutput->size(3), gradOutput->size(2) ); } @@ -636,7 +636,7 @@ void THNN_(VolumetricConvolutionMM_updateGradInput)( } else { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; #ifdef _OPENMP @@ -677,8 
+677,8 @@ static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)( int64_t i; THTensor *gradOutput2d = THTensor_(newWithStorage2d)( gradOutput->storage, gradOutput->storageOffset, - gradOutput->size[0], -1, - gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1 + gradOutput->size(0), -1, + gradOutput->size(1)*gradOutput->size(2)*gradOutput->size(3), -1 ); if (gradWeight){ @@ -689,12 +689,12 @@ static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)( } if (gradBias) { - for (i = 0; i < gradBias->size[0]; i++) + for (i = 0; i < gradBias->size(0); i++) { int64_t k; real sum = 0; - real *data = THStorage_(data)(gradOutput2d->storage) + gradOutput2d->storageOffset + i*gradOutput2d->stride[0]; - for (k = 0; k < gradOutput2d->size[1]; k++) + real *data = THStorage_(data)(gradOutput2d->storage) + gradOutput2d->storageOffset + i*gradOutput2d->stride(0); + for (k = 0; k < gradOutput2d->size(1); k++) sum += data[k]; (THStorage_(data)(gradBias->storage) + gradBias->storageOffset)[i] += scale * sum; @@ -735,7 +735,7 @@ void THNN_(VolumetricConvolutionMM_accGradParameters)( } else // batch mode { - int64_t T = input->size[0]; + int64_t T = input->size(0); int64_t t; #ifdef _OPENMP diff --git a/aten/src/THNN/generic/VolumetricDilatedConvolution.c b/aten/src/THNN/generic/VolumetricDilatedConvolution.c index 66d560a5d3e06e..845093eb1f0b30 100644 --- a/aten/src/THNN/generic/VolumetricDilatedConvolution.c +++ b/aten/src/THNN/generic/VolumetricDilatedConvolution.c @@ -24,7 +24,7 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for weight, but got: %s"); if (bias != NULL) { - THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size(0)); } } else if (!weight_nullable) { THError("weight tensor is expected to be non-nullable"); @@ -44,9 +44,9 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( dimw++; } - int64_t inputDepth = input->size[dimd]; - int64_t inputHeight = input->size[dimh]; - int64_t inputWidth = input->size[dimw]; + int64_t inputDepth = input->size(dimd); + int64_t inputHeight = input->size(dimh); + int64_t inputWidth = input->size(dimw); int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; @@ -58,16 +58,16 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( } if (weight != NULL) { - int64_t nInputPlane = weight->size[1]; + int64_t nInputPlane = weight->size(1); THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); } if (gradOutput != NULL) { if (weight != NULL) { - int64_t nOutputPlane = weight->size[0]; + int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = bias->size[0]; + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth); @@ -95,8 +95,8 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)( dilationT, dilationH, dilationW, 0); // Params: - int64_t nInputPlane = weight->size[1]; - int64_t nOutputPlane = weight->size[0]; + int64_t nInputPlane = weight->size(1); + int64_t nOutputPlane = weight->size(0); input = THTensor_(newContiguous)(input); weight = THTensor_(newContiguous)(weight); @@ -109,18 
+109,18 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)( if (input->dim() == 4) { // Force batch is_batch = 0; - THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); } - int64_t inputDepth = input->size[2]; - int64_t inputHeight = input->size[3]; - int64_t inputWidth = input->size[4]; + int64_t inputDepth = input->size(2); + int64_t inputHeight = input->size(3); + int64_t inputWidth = input->size(4); int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); @@ -133,7 +133,7 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)( // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. if (ones->dim() != 3 || - ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); THTensor_(fill)(ones, 1); @@ -182,7 +182,7 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)( // M,N,K are dims of matrix A and B int64_t m = nOutputPlane; - int64_t n = columns->size[1]; + int64_t n = columns->size(1); int64_t k = nInputPlane*kT*kH*kW; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -230,8 +230,8 @@ void THNN_(VolumetricDilatedConvolution_updateGradInput)( dilationT, dilationH, dilationW, 0); // Params - int64_t nInputPlane = weight->size[1]; - int64_t nOutputPlane = weight->size[0]; + int64_t nInputPlane = weight->size(1); + int64_t nOutputPlane = weight->size(0); input = THTensor_(newContiguous)(input); gradOutput = THTensor_(newContiguous)(gradOutput); @@ -242,19 +242,19 @@ void THNN_(VolumetricDilatedConvolution_updateGradInput)( if (input->dim() == 4) { // Force batch is_batch = 0; - THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THTensor_(resize5d)(input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - int64_t inputDepth = input->size[2]; - int64_t inputWidth = input->size[4]; - int64_t inputHeight = input->size[3]; + int64_t inputDepth = input->size(2); + int64_t inputWidth = input->size(4); + int64_t inputHeight = input->size(3); int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Resize output THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); @@ -275,7 +275,7 @@ void 
THNN_(VolumetricDilatedConvolution_updateGradInput)( // M,N,K are dims of matrix A and B int64_t m = nInputPlane*kT*kW*kH; - int64_t n = gradColumns->size[1]; + int64_t n = gradColumns->size(1); int64_t k = nOutputPlane; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) @@ -352,24 +352,24 @@ void THNN_(VolumetricDilatedConvolution_accGradParameters)( if (input->dim() == 4) { // Force batch is_batch = 0; - THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THTensor_(resize5d)(input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - int64_t nInputPlane = input->size[1]; - int64_t nOutputPlane = gradOutput->size[1]; - int64_t inputDepth = input->size[2]; - int64_t inputWidth = input->size[4]; - int64_t inputHeight = input->size[3]; + int64_t nInputPlane = input->size(1); + int64_t nOutputPlane = gradOutput->size(1); + int64_t inputDepth = input->size(2); + int64_t inputWidth = input->size(4); + int64_t inputHeight = input->size(3); int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes - int64_t batchSize = input->size[0]; + int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + if (ones->dim() != 3 || ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... 
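The m/n/k choices around the THBlas_(gemm) calls follow the usual im2col (here vol2col) formulation. The sketch below spells the shapes out with invented numbers; the claim that columns->size(1) equals the number of output locations is an assumption based on the standard column-buffer layout, not something visible in these hunks.

    #include <cstdint>
    #include <cstdio>

    // Made-up shapes spelling out the GEMM dimensions used above:
    //   weight  viewed as  nOutputPlane            x (nInputPlane*kT*kH*kW)
    //   columns viewed as  (nInputPlane*kT*kH*kW)  x (number of output voxels)
    //   output  viewed as  nOutputPlane            x (number of output voxels)
    int main() {
      const int64_t nInputPlane = 4, nOutputPlane = 8;
      const int64_t kT = 3, kH = 3, kW = 3;
      const int64_t outD = 5, outH = 7, outW = 7;

      const int64_t m = nOutputPlane;                // rows of the per-sample output
      const int64_t n = outD * outH * outW;          // assumed to equal columns->size(1)
      const int64_t k = nInputPlane * kT * kH * kW;  // shared inner dimension

      std::printf("updateOutput gemm: m=%lld n=%lld k=%lld\n",
                  (long long)m, (long long)n, (long long)k);
      // updateGradInput swaps the roles (m = nInputPlane*kT*kW*kH, k = nOutputPlane)
      // because the transposed weight maps gradOutput back into the column buffer.
      return 0;
    }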
THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); THTensor_(fill)(ones, 1); @@ -405,7 +405,7 @@ void THNN_(VolumetricDilatedConvolution_accGradParameters)( // M,N,K are dims of matrix A and B int64_t m = nOutputPlane; int64_t n = nInputPlane*kT*kW*kH; - int64_t k = columns->size[1]; + int64_t k = columns->size(1); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( diff --git a/aten/src/THNN/generic/VolumetricDilatedMaxPooling.c b/aten/src/THNN/generic/VolumetricDilatedMaxPooling.c index 1641c6018c51d2..aaa00ffa353769 100644 --- a/aten/src/THNN/generic/VolumetricDilatedMaxPooling.c +++ b/aten/src/THNN/generic/VolumetricDilatedMaxPooling.c @@ -51,10 +51,10 @@ static inline void THNN_(VolumetricDilatedMaxPooling_shapeCheck)( "kT: %d kW: %d, kH: %d, padT: %d, padW: %d, padH: %d", kT, kW, kH, pT, pW, pH); - nslices = input->size[dimN]; - itime = input->size[dimt]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimN); + itime = input->size(dimt); + iheight = input->size(dimh); + iwidth = input->size(dimw); if (ceilMode) { otime = (int)(ceil((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1; @@ -241,10 +241,10 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)( ceilMode); /* sizes */ - nslices = input->size[dimN]; - itime = input->size[dimt]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimN); + itime = input->size(dimt); + iheight = input->size(dimh); + iwidth = input->size(dimw); if (ceilMode) { otime = (int)(ceil((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1; @@ -298,7 +298,7 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)( else /* batch mode */ { int64_t p; - int64_t nBatch = input->size[0]; + int64_t nBatch = input->size(0); int64_t istride = nslices * itime * iwidth * iheight; int64_t ostride = nslices * otime * owidth * oheight; @@ -444,13 +444,13 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)( } /* sizes */ - nslices = input->size[dimN]; - itime = input->size[dimt]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; - otime = gradOutput->size[dimt]; - oheight = gradOutput->size[dimh]; - owidth = gradOutput->size[dimw]; + nslices = input->size(dimN); + itime = input->size(dimt); + iheight = input->size(dimh); + iwidth = input->size(dimw); + otime = gradOutput->size(dimt); + oheight = gradOutput->size(dimh); + owidth = gradOutput->size(dimw); /* get raw pointers */ gradInput_data = THTensor_(data)(gradInput); @@ -474,7 +474,7 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)( else /* batch mode */ { int64_t p; - int64_t nBatch = input->size[0]; + int64_t nBatch = input->size(0); int64_t istride = nslices * itime * iwidth * iheight; int64_t ostride = nslices * otime * owidth * oheight; diff --git a/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c b/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c index c7c18eab7a0ddd..4cc4dcc69837d8 100644 --- a/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c +++ b/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c @@ -112,7 +112,7 @@ static inline void THNN_(VolumetricFullDilatedConvolution_shapeCheck)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for weight, but got: %s"); if (bias != NULL) { - THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]); + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size(1)); } } else if (!weight_nullable) { THError("weight tensor is expected to be 
non-nullable"); @@ -132,13 +132,13 @@ static inline void THNN_(VolumetricFullDilatedConvolution_shapeCheck)( } if (weight != NULL) { - const int64_t nInputPlane = weight->size[0]; + const int64_t nInputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); } - const int64_t inputWidth = input->size[dimw]; - const int64_t inputHeight = input->size[dimh]; - const int64_t inputDepth = input->size[dimd]; + const int64_t inputWidth = input->size(dimw); + const int64_t inputHeight = input->size(dimh); + const int64_t inputDepth = input->size(dimd); const int64_t outputDepth = (inputDepth - 1) * dT - 2*pT + (dilationT * (kT - 1) + 1) + aT; const int64_t outputHeight = (inputHeight - 1) * dH - 2*pH + (dilationH * (kH - 1) + 1) + aH; const int64_t outputWidth = (inputWidth - 1) * dW - 2*pW + (dilationW * (kW - 1) + 1) + aW; @@ -151,10 +151,10 @@ static inline void THNN_(VolumetricFullDilatedConvolution_shapeCheck)( if (gradOutput != NULL) { if (weight != NULL) { - const int64_t nOutputPlane = weight->size[1]; + const int64_t nOutputPlane = weight->size(1); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - const int64_t nOutputPlane = bias->size[0]; + const int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth); @@ -184,8 +184,8 @@ void THNN_(VolumetricFullDilatedConvolution_updateOutput)( input, NULL, weight, bias, kT, kW, kH, dT, dW, dH, pT, pW, pH, dilationT, dilationW, dilationH, aT, aW, aH, 0); - const int nInputPlane = (int)weight->size[0]; - const int nOutputPlane = (int)weight->size[1]; + const int nInputPlane = (int)weight->size(0); + const int nOutputPlane = (int)weight->size(1); input = THTensor_(newContiguous)(input); weight = THTensor_(newContiguous)(weight); @@ -195,18 +195,18 @@ void THNN_(VolumetricFullDilatedConvolution_updateOutput)( { // Force batch is_batch = 0; - THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); } - const int64_t inputWidth = input->size[4]; - const int64_t inputHeight = input->size[3]; - const int64_t inputDepth = input->size[2]; + const int64_t inputWidth = input->size(4); + const int64_t inputHeight = input->size(3); + const int64_t inputDepth = input->size(2); const int64_t outputDepth = (inputDepth - 1) * dT - 2*pT + (dilationT * (kT - 1) + 1) + aT; const int64_t outputHeight = (inputHeight - 1) * dH - 2*pH + (dilationH * (kH - 1) + 1) + aH; const int64_t outputWidth = (inputWidth - 1) * dW - 2*pW + (dilationW * (kW - 1) + 1) + aW; // Batch size + input planes - const int64_t batchSize = input->size[0]; + const int64_t batchSize = input->size(0); // Resize output THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); @@ -218,7 +218,7 @@ void THNN_(VolumetricFullDilatedConvolution_updateOutput)( // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. - if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + if (ones->dim() != 3 || ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... 
THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); @@ -239,9 +239,9 @@ void THNN_(VolumetricFullDilatedConvolution_updateOutput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - const int64_t m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; - const int64_t n = columns->size[1]; - const int64_t k = weight->size[0]; + const int64_t m = weight->size(1) * weight->size(2) * weight->size(3) * weight->size(4); + const int64_t n = columns->size(1); + const int64_t k = weight->size(0); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( @@ -324,8 +324,8 @@ void THNN_(VolumetricFullDilatedConvolution_updateGradInput)( input, gradOutput, weight, NULL, kT, kW, kH, dT, dW, dH, pT, pW, pH, dilationT, dilationW, dilationH, aT, aW, aH, 0); - const int64_t nInputPlane = weight->size[0]; - const int64_t nOutputPlane = weight->size[1]; + const int64_t nInputPlane = weight->size(0); + const int64_t nOutputPlane = weight->size(1); input = THTensor_(newContiguous)(input); weight = THTensor_(newContiguous)(weight); @@ -336,19 +336,19 @@ void THNN_(VolumetricFullDilatedConvolution_updateGradInput)( { // Force batch is_batch = 0; - THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THTensor_(resize5d)(input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - const int64_t inputWidth = input->size[4]; - const int64_t inputHeight = input->size[3]; - const int64_t inputDepth = input->size[2]; + const int64_t inputWidth = input->size(4); + const int64_t inputHeight = input->size(3); + const int64_t inputDepth = input->size(2); const int64_t outputDepth = (inputDepth - 1) * dT - 2*pT + (dilationT * (kT - 1) + 1) + aT; const int64_t outputHeight = (inputHeight - 1) * dH - 2*pH + (dilationH * (kH - 1) + 1) + aH; const int64_t outputWidth = (inputWidth - 1) * dW - 2*pW + (dilationW * (kW - 1) + 1) + aW; // Batch size + input planes - const int64_t batchSize = input->size[0]; + const int64_t batchSize = input->size(0); // Resize output THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); @@ -383,9 +383,9 @@ void THNN_(VolumetricFullDilatedConvolution_updateGradInput)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - const int64_t m = weight->size[0]; - const int64_t n = gradColumns->size[1]; - const int64_t k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + const int64_t m = weight->size(0); + const int64_t n = gradColumns->size(1); + const int64_t k = weight->size(1) * weight->size(2) * weight->size(3) * weight->size(4); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( @@ -464,22 +464,22 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( { // Force batch is_batch = 0; - THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + THTensor_(resize5d)(input, 1, input->size(0), input->size(1), input->size(2), input->size(3)); + 
THTensor_(resize5d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2), gradOutput->size(3)); } - const int64_t inputWidth = input->size[4]; - const int64_t inputHeight = input->size[3]; - const int64_t inputDepth = input->size[2]; + const int64_t inputWidth = input->size(4); + const int64_t inputHeight = input->size(3); + const int64_t inputDepth = input->size(2); const int64_t outputDepth = (inputDepth - 1) * dT - 2*pT + (dilationT * (kT - 1) + 1) + aT; const int64_t outputHeight = (inputHeight - 1) * dH - 2*pH + (dilationH * (kH - 1) + 1) + aH; const int64_t outputWidth = (inputWidth - 1) * dW - 2*pW + (dilationW * (kW - 1) + 1) + aW; // Batch size + input planes - const int64_t batchSize = input->size[0]; + const int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation - if (ones->dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + if (ones->dim() != 3 || ones->size(0)*ones->size(1)*ones->size(2) < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); @@ -519,9 +519,9 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - const int64_t n = columns->size[0]; // nOutputPlane * kt * kh * kw - const int64_t m = input_n->size[0]; // nInputPlane - const int64_t k = columns->size[1]; // inputHeight * inputWidth + const int64_t n = columns->size(0); // nOutputPlane * kt * kh * kw + const int64_t m = input_n->size(0); // nInputPlane + const int64_t k = columns->size(1); // inputHeight * inputWidth // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( @@ -563,7 +563,7 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( if (is_batch == 0) { THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); - THTensor_(resize4d)(input, input->size[1], inputDepth, inputHeight, inputWidth); + THTensor_(resize4d)(input, input->size(1), inputDepth, inputHeight, inputWidth); } THTensor_(free)(input); diff --git a/aten/src/THNN/generic/VolumetricMaxUnpooling.c b/aten/src/THNN/generic/VolumetricMaxUnpooling.c index b8e649cc39d7e4..566b656e739b58 100644 --- a/aten/src/THNN/generic/VolumetricMaxUnpooling.c +++ b/aten/src/THNN/generic/VolumetricMaxUnpooling.c @@ -38,14 +38,14 @@ static inline void THNN_(VolumetricMaxUnpooling_shapeCheck)( dimh++; dimn++; } - int nslices = input->size[dimn]; + int nslices = input->size(dimn); if (gradOutput != NULL) { - if (oT != gradOutput->size[dimt] || oW != gradOutput->size[dimw] || oH != gradOutput->size[dimh]) + if (oT != gradOutput->size(dimt) || oW != gradOutput->size(dimw) || oH != gradOutput->size(dimh)) { THError( "Inconsistent gradOutput size. 
oT= %d, oH= %d, oW= %d, gradOutput: %dx%dx%d", - oT, oH, oW, gradOutput->size[dimt], gradOutput->size[dimh], gradOutput->size[dimw] + oT, oH, oW, gradOutput->size(dimt), gradOutput->size(dimh), gradOutput->size(dimw) ); } @@ -140,17 +140,17 @@ void THNN_(VolumetricMaxUnpooling_updateOutput)( if (input->dim() == 5) { - nbatch = input->size[0]; + nbatch = input->size(0); dimt++; dimw++; dimh++; } /* sizes */ - nslices = input->size[dimt-1]; - iT = input->size[dimt]; - iH = input->size[dimh]; - iW = input->size[dimw]; + nslices = input->size(dimt-1); + iT = input->size(dimt); + iH = input->size(dimh); + iW = input->size(dimw); /* get contiguous input */ input = THTensor_(newContiguous)(input); @@ -287,17 +287,17 @@ void THNN_(VolumetricMaxUnpooling_updateGradInput)( if (input->dim() == 5) { - nbatch = input->size[0]; + nbatch = input->size(0); dimt++; dimw++; dimh++; } /* sizes */ - nslices = input->size[dimt-1]; - iT = input->size[dimt]; - iH = input->size[dimh]; - iW = input->size[dimw]; + nslices = input->size(dimt-1); + iT = input->size(dimt); + iH = input->size(dimh); + iW = input->size(dimw); /* get raw pointers */ gradInput_data = THTensor_(data)(gradInput); diff --git a/aten/src/THNN/generic/VolumetricReplicationPadding.c b/aten/src/THNN/generic/VolumetricReplicationPadding.c index e64cb3662f01a5..7e91bfff8ed377 100644 --- a/aten/src/THNN/generic/VolumetricReplicationPadding.c +++ b/aten/src/THNN/generic/VolumetricReplicationPadding.c @@ -33,10 +33,10 @@ static inline void THNN_(VolumetricReplicationPadding_shapeCheck)( } /* sizes */ - nslices = input->size[dimslices]; - idepth = input->size[dimd]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + idepth = input->size(dimd); + iheight = input->size(dimh); + iwidth = input->size(dimw); odepth = idepth + pfront + pback; oheight = iheight + ptop + pbottom; owidth = iwidth + pleft + pright; @@ -151,7 +151,7 @@ THNN_(VolumetricReplicationPadding_shapeCheck)( if (input->dim() == 5) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; dimd++; @@ -159,10 +159,10 @@ THNN_(VolumetricReplicationPadding_shapeCheck)( } /* sizes */ - nslices = input->size[dimslices]; - idepth = input->size[dimd]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + idepth = input->size(dimd); + iheight = input->size(dimh); + iwidth = input->size(dimw); odepth = idepth + pfront + pback; oheight = iheight + ptop + pbottom; owidth = iwidth + pleft + pright; @@ -295,7 +295,7 @@ void THNN_(VolumetricReplicationPadding_updateGradInput)(THNNState *state, if (input->dim() == 5) { - nbatch = input->size[0]; + nbatch = input->size(0); dimw++; dimh++; dimd++; @@ -303,10 +303,10 @@ void THNN_(VolumetricReplicationPadding_updateGradInput)(THNNState *state, } /* sizes */ - nslices = input->size[dimslices]; - idepth = input->size[dimd]; - iheight = input->size[dimh]; - iwidth = input->size[dimw]; + nslices = input->size(dimslices); + idepth = input->size(dimd); + iheight = input->size(dimh); + iwidth = input->size(dimw); odepth = idepth + pfront + pback; oheight = iheight + ptop + pbottom; owidth = iwidth + pleft + pright; diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index a28fb02ed02e23..0518c089235ad3 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -53,6 +53,11 @@ if(BUILD_ATEN) ENDIF(USE_ROCM) endif() +# ---[ Torch build +if(BUILD_TORCH) + add_subdirectory(../torch torch) +endif() + # ---[ Caffe2 build if(BUILD_CAFFE2) # Note: the 
folders that are being commented out have not been properly @@ -456,7 +461,8 @@ if(BUILD_CAFFE2) endif() if(USE_ROCM) - add_library(caffe2_pybind11_state_hip MODULE ${Caffe2_HIP_PYTHON_SRCS}) + hip_add_library(caffe2_pybind11_state_hip MODULE ${Caffe2_HIP_PYTHON_SRCS}) + set_target_properties(caffe2_pybind11_state_hip PROPERTIES LINKER_LANGUAGE HIP) set_target_properties(caffe2_pybind11_state_hip PROPERTIES COMPILE_FLAGS "${HIP_HIPCC_FLAGS} -fvisibility=hidden") set_target_properties(caffe2_pybind11_state_hip PROPERTIES PREFIX "") set_target_properties(caffe2_pybind11_state_hip PROPERTIES SUFFIX ${PY_EXT_SUFFIX}) diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu index 3b378d23ffb522..cb4eaedfdbeceb 100644 --- a/caffe2/core/context_gpu.cu +++ b/caffe2/core/context_gpu.cu @@ -11,6 +11,7 @@ #include "caffe2/core/macros.h" #include "caffe2/core/asan.h" +#include "caffe2/core/blob_stats.h" #ifdef CAFFE2_USE_CUDNN #include "caffe2/core/common_cudnn.h" #endif // CAFFE2_USE_CUDNN @@ -252,7 +253,22 @@ struct Caffe2CudaInitializerHelper { } } }; -} // namespace + +struct TensorCUDAStatGetter : BlobStatGetter { + size_t sizeBytes(const Blob& blob) const override { + const auto& tensor = blob.Get(); + auto nbytes = tensor.nbytes(); + if (nbytes > 0 && tensor.IsType()) { + const auto* data = tensor.data(); + for (int i = 0; i < tensor.size(); ++i) { + nbytes += data[i].size(); + } + } + return nbytes; + } +}; +REGISTER_BLOB_STAT_GETTER(TensorCUDA, TensorCUDAStatGetter); +} // namespace /** * A utility function to rectify the gpu id. If the context specifies the diff --git a/caffe2/core/dispatch/CMakeLists.txt b/caffe2/core/dispatch/CMakeLists.txt index 736225fc1c6844..841bfca164684a 100644 --- a/caffe2/core/dispatch/CMakeLists.txt +++ b/caffe2/core/dispatch/CMakeLists.txt @@ -5,6 +5,7 @@ set(LIB_SOURCES DispatchTable.cpp KernelRegistration.cpp LayoutId.cpp + LeftRight.cpp OpSchema.cpp OpSchemaRegistration.cpp TensorTypeId.cpp diff --git a/caffe2/core/dispatch/DispatchTable.h b/caffe2/core/dispatch/DispatchTable.h index 43901b2e5500a7..0f119791dbfa03 100644 --- a/caffe2/core/dispatch/DispatchTable.h +++ b/caffe2/core/dispatch/DispatchTable.h @@ -1,104 +1,93 @@ #pragma once -#include "caffe2/utils/flat_hash_map/flat_hash_map.h" -#include "caffe2/utils/Metaprogramming.h" +#include "caffe2/core/dispatch/LeftRight.h" #include "caffe2/core/dispatch/OpSchema.h" +#include "caffe2/utils/Metaprogramming.h" +#include "caffe2/utils/flat_hash_map/flat_hash_map.h" -#include #include -#include +#include #include #include +#include +#include namespace c10 { namespace details { - /// Kernel implementations in a thread-safe hash table. -template +template class ThreadsafeOperatorTable_ final { -public: - // TODO The current implementation below does not have the correct correctness characteristics - // which we need. It's worth spelling out exactly what we need: - // - // - We need LOCK FREE read access to the table (as per the performance benchmark - // at https://fb.quip.com/hvz3AGnx8MQ8 - // - // - We need to support writes which are possibly concurrent with reads, occurring when - // a dynamic library is loaded or unloaded. - // - // - We probably can require that dynamic library loads/unloads be synchronized (so - // there are never two concurrent loads.) 
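The replacement hunks that follow route every dispatch-table access through the read()/write() lambda interface of the LeftRight primitive added later in this diff. The sketch below is interface-only: LeftRightish is an invented stand-in backed by a plain mutex, so it shows how emplace/lookup get rewritten around lambdas but does not reproduce the wait-free reader scheme.

    #include <mutex>
    #include <string>
    #include <unordered_map>
    #include <utility>

    // Interface-only stand-in: LeftRightish copies the read()/write() lambda
    // shape of the LeftRight class added later in this diff, but it is backed
    // by a plain mutex, so it does not reproduce the wait-free reader scheme.
    template <class T>
    class LeftRightish {
     public:
      template <class F>
      auto read(F&& fn) const -> decltype(fn(std::declval<const T&>())) {
        std::lock_guard<std::mutex> guard(mutex_);
        return fn(data_);
      }
      template <class F>
      auto write(F&& fn) -> decltype(fn(std::declval<T&>())) {
        std::lock_guard<std::mutex> guard(mutex_);
        return fn(data_);
      }
     private:
      mutable std::mutex mutex_;
      T data_;
    };

    using Table = std::unordered_map<std::string, void*>;

    // Mirrors the reworked emplace(): the mutation runs inside write() and
    // reports whether the key was actually inserted.
    bool registerKernel(LeftRightish<Table>& kernels, const std::string& key, void* fn) {
      return kernels.write([&](Table& map) -> bool {
        return map.emplace(key, fn).second;
      });
    }

    // Mirrors the reworked lookup(): the search runs inside read() and returns
    // the stored pointer, or nullptr when the key is absent.
    void* lookupKernel(const LeftRightish<Table>& kernels, const std::string& key) {
      return kernels.read([&](const Table& map) -> void* {
        auto it = map.find(key);
        return it == map.end() ? nullptr : it->second;
      });
    }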
- - template - void emplace(Key_&& key, void* value) { - using std::to_string; - // TODO Locking - //std::unique_lock lock(mutex_); - - auto result = map_.emplace(std::forward(key), value); - if (!result.second) { - std::ostringstream msg; - msg << "Tried to register conflicting kernels to the dispatcher: " << key; - throw std::logic_error(msg.str()); - } + public: + template + void emplace(Key_&& key, void* value) { + bool res = map_.write([&](ska::flat_hash_map& map) -> bool { + auto result = map->emplace(std::forward(key), value); + return result.second; + }); + if (!res) { + std::ostringstream msg; + msg << "Tried to register conflicting kernels to the dispatcher: " << key; + throw std::logic_error(msg.str()); } + } - void erase(const Key& key) { - // TODO Locking - //std::unique_lock lock(mutex_); - - size_t num_removed = map_.erase(key); - assert(num_removed <= 1); //This is not a multi-map - if (num_removed == 0) { - throw std::logic_error("Tried to deregister a kernel that isn't registered."); - } + void erase(const Key& key) { + auto num_removed = + map_.write([&](ska::flat_hash_map& map) -> size_t { + return map->erase(key); + }); + assert(num_removed <= 1); // This is not a multi-map + if (num_removed == 0) { + throw std::logic_error( + "Tried to deregister a kernel that isn't registered."); } + } - void* lookup(const Key& key) const { - // TODO (lock needed but slow perf. Find better way) - // std::shared_lock lock(mutex_); - auto found = map_.find(key); - if (found == map_.end()) { - return nullptr; - } else { + void* lookup(const Key& key) const { + return map_.read([&](const ska::flat_hash_map& map) -> void* { + auto found = map->find(key); + if (found != map->end()) { return found->second; + } else { + return nullptr; } - } + }); + } -private: - ska::flat_hash_map map_; - // TODO Figure out how to get fast locking in C++11 (use boost::shared_timed_mutex? folly::SharedMutex? LR pattern?) - //mutable std::shared_timed_mutex mutex_; + private: + LeftRight> map_; }; } // namespace details /** * Per-operator dispatch table. * - * Given an operator specified by 'OpSchemaDef', this class records a dispatch table for - * various kernels provided for this operator. For example, if we consider the operator - * add(Tensor, Tensor), the dispatch table for this operator may contain implementations - * for various dynamic tensor types, such as (CPUFloatTensor, CPUFloatTensor), - * (CUDAFloatTensor, CUDAFloatTensor), etc. + * Given an operator specified by 'OpSchemaDef', this class records a dispatch + * table for various kernels provided for this operator. For example, if we + * consider the operator add(Tensor, Tensor), the dispatch table for this + * operator may contain implementations for various dynamic tensor types, such + * as (CPUFloatTensor, CPUFloatTensor), (CUDAFloatTensor, CUDAFloatTensor), etc. * * @tparam OpSchemaDef The operator signature this dispatch table encodes. */ // TODO: Support dispatch for meta-operators (which apply to all dynamic types) -template +template class DispatchTable final { -private: + private: using Schema = OpSchema; -public: - DispatchTable(): kernels_() {} + public: + DispatchTable() : kernels_() {} /** * Register a kernel in the table at some dispatch key. 
* @param func Concrete kernel function implementation to register * @param dispatch_key Dispatch key to define when this kernel is selected */ - void registerKernel(typename Schema::signature::func_type* func, typename Schema::dispatch::dispatch_key_type dispatch_key) { + void registerKernel( + typename Schema::signature::func_type* func, + typename Schema::dispatch::dispatch_key_type dispatch_key) { kernels_.emplace(std::move(dispatch_key), reinterpret_cast(func)); } @@ -107,10 +96,11 @@ class DispatchTable final { * * @param dispatch_key Dispatch key to unregister. */ - // TODO: This isn't going to work so well when we get more complicated override patterns! - // In this case, an operator will show up in multiple slots, and erasing them one-by-one - // is probably not such a good idea. - void deregisterKernel(const typename Schema::dispatch::dispatch_key_type& dispatch_key) { + // TODO: This isn't going to work so well when we get more complicated + // override patterns! In this case, an operator will show up in multiple + // slots, and erasing them one-by-one is probably not such a good idea. + void deregisterKernel( + const typename Schema::dispatch::dispatch_key_type& dispatch_key) { kernels_.erase(dispatch_key); } @@ -121,27 +111,36 @@ class DispatchTable final { * @param args Arguments to invoke the function with * @return Returned value of the operator */ - template + template typename Schema::signature::return_type call(Args&&... args) const { - // TODO Better error message, but need to take care that reference arguments match non-reference arguments and so on. - // static_assert(std::is_same::value, "Argument types don't match operator signature"); + // TODO Better error message, but need to take care that reference arguments + // match non-reference arguments and so on. + // static_assert(std::is_same::value, "Argument types don't match + // operator signature"); auto kernel_func = lookupKernelFunc_(args...); return kernel_func(std::forward(args)...); } -private: - template - typename Schema::signature::func_type* lookupKernelFunc_(const Args&... args) const { + private: + template + typename Schema::signature::func_type* lookupKernelFunc_( + const Args&... args) const { auto dispatch_key = Schema::dispatch::dispatch_key(args...); void* found = kernels_.lookup(dispatch_key); if (found == nullptr) { - // TODO Better error message - include op name and dispatch key (i.e. argument types) - throw std::logic_error(std::string() + "Didn't find kernel to dispatch to for operator '" + Schema::metadata::name() + "'"); + // TODO Better error message - include op name and dispatch key (i.e. + // argument types) + throw std::logic_error( + std::string() + "Didn't find kernel to dispatch to for operator '" + + Schema::metadata::name() + "'"); } return reinterpret_cast(found); } - details::ThreadsafeOperatorTable_ kernels_; + details::ThreadsafeOperatorTable_< + typename Schema::dispatch::dispatch_key_type> + kernels_; }; } // namespace c10 @@ -151,4 +150,5 @@ class DispatchTable final { * It has an implementation for each op schema def in a cpp file, because * we can't rely on the one-definition-rule. 
*/ -template c10::DispatchTable& c10_dispatch_table(); +template +c10::DispatchTable& c10_dispatch_table(); diff --git a/caffe2/core/dispatch/LeftRight.cpp b/caffe2/core/dispatch/LeftRight.cpp new file mode 100644 index 00000000000000..26e7a7e114ccf9 --- /dev/null +++ b/caffe2/core/dispatch/LeftRight.cpp @@ -0,0 +1 @@ +#include "caffe2/core/dispatch/LeftRight.h" diff --git a/caffe2/core/dispatch/LeftRight.h b/caffe2/core/dispatch/LeftRight.h new file mode 100644 index 00000000000000..dc60a303c412cd --- /dev/null +++ b/caffe2/core/dispatch/LeftRight.h @@ -0,0 +1,72 @@ +#include +#include +#include +#include + +namespace c10 { +namespace details { + +// LeftRight wait-free readers synchronization primitive +// https://hal.archives-ouvertes.fr/hal-01207881/document +template +class LeftRight { + public: + LeftRight() { + counters_[0].store(0); + counters_[1].store(0); + } + + template + auto read(F&& readFunc) -> typename std::result_of::type { + auto localCounterIndex = counterIndex_.load(); + ++counters_[localCounterIndex]; + try { + auto r = readFunc(data_[dataIndex_.load()]); + --counters_[localCounterIndex]; + return r; + } catch (const std::exception& e) { + --counters_[localCounterIndex]; + throw; + } + } + + // Throwing from write would result in invalid state + template + auto write(F&& writeFunc) -> typename std::result_of::type { + std::unique_lock lock(mutex_); + uniqueWrite(std::forward(writeFunc)); + } + + private: + // This function doesn't use any locks for the writers. Use only if you know + // what you're doing + template + auto uniqueWrite(F&& writeFunc) -> typename std::result_of::type { + try { + auto localDataIndex = dataIndex_.load(); + writeFunc(data_[localDataIndex ^ 1]); + dataIndex_ = localDataIndex ^ 1; + auto localCounterIndex = counterIndex_.load(); + while (counters_[localCounterIndex ^ 1].load()) { + std::this_thread::yield(); + } + counterIndex_ = localCounterIndex ^ 1; + while (counters_[localCounterIndex].load()) { + std::this_thread::yield(); + } + return writeFunc(data_[localDataIndex]); + } catch (const std::exception& e) { + // rethrow + throw; + } + } + + std::mutex mutex_; + std::atomic counterIndex_{0}; + std::atomic dataIndex_{0}; + std::atomic counters_[2]; + T data_[2]; +}; + +} // namespace details +} // namespace c10 diff --git a/caffe2/core/hip/operator_hip_test.cc b/caffe2/core/hip/operator_hip_test.cc index 14b0188452fe29..f7c6ef34c43cdc 100644 --- a/caffe2/core/hip/operator_hip_test.cc +++ b/caffe2/core/hip/operator_hip_test.cc @@ -49,7 +49,7 @@ TEST(EnginePrefTest, GPUDeviceDefaultPreferredEngines) { const auto op = CreateOperator(op_def, &ws); EXPECT_NE(nullptr, op.get()); - EXPECT_EQ(static_cast(op.get())->type(), "HIP"); + EXPECT_EQ(static_cast(op.get())->type(), "MIOPEN"); } } diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc index e017456497e743..169e730125a2cd 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -70,8 +70,8 @@ PerOpEnginePrefType& g_per_op_engine_pref() { } GlobalEnginePrefType& g_global_engine_pref() { - static auto* g_global_engine_pref_ = - new GlobalEnginePrefType{{DeviceType::CUDA, {"CUDNN"}}}; + static auto* g_global_engine_pref_ = new GlobalEnginePrefType{ + {DeviceType::CUDA, {"CUDNN"}}, {DeviceType::HIP, {"MIOPEN"}}}; return *g_global_engine_pref_; } diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 9372ed49f4243a..26bb02415d3ea2 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -20,6 +20,7 @@ #include "caffe2/core/types.h" #include 
"caffe2/core/workspace.h" #include "caffe2/proto/caffe2.pb.h" +#include "caffe2/utils/filler.h" #include "caffe2/utils/proto_utils.h" namespace caffe2 { @@ -521,6 +522,74 @@ class Operator : public OperatorBase { return &context_; } + virtual std::vector> InputFillers( + const std::vector>& shapes) { + CAFFE_ENFORCE(shapes.size() == Inputs().size()); + std::vector> fillers; + for (const auto& shape : shapes) { + fillers.emplace_back(shape, &context_); + } + + return fillers; + } + +#define DISABLE_INPUT_FILLERS(Context) \ + std::vector> InputFillers( \ + const std::vector>& /* unused */) override { \ + throw UnsupportedOperatorFeature("Op does not have input fillers"); \ + } + + void SparseLengthsFillerHelper( + const std::vector>& shapes, + size_t value_index, + size_t length_index, + std::vector>* fillers) { + CAFFE_ENFORCE_EQ(shapes[length_index].size(), 1); + (*fillers)[length_index].SparseLengths(shapes[value_index].front()); + } + + void SparseSegmentsFillerHelper( + const std::vector>& shapes, + size_t value_index, + size_t segment_index, + std::vector>* fillers) { + CAFFE_ENFORCE_EQ(shapes[segment_index].size(), 1); + // TODO: what would be a proper #segments + (*fillers)[segment_index].SparseSegments(shapes[value_index].front() - 1); + } + +// The helper is build sparse input with values and lengths; e.g.: +// values = [1, 2, 3, 2, 4, 6, 7, 3, 6] +// \_____/ \________/ \__/ +// lengths = [3, 4, 2] +#define USE_VALUE_LENGTH_INPUT_FILLERS(Context, value_index, length_index) \ + std::vector> InputFillers( \ + const std::vector>& shapes) override { \ + CAFFE_ENFORCE_EQ(shapes.size(), Operator::Inputs().size()); \ + auto fillers = Operator::InputFillers(shapes); \ + Operator::SparseLengthsFillerHelper( \ + shapes, value_index, length_index, &fillers); \ + return fillers; \ + } + + // The helper is build sparse input with values, keys, and lengths; e.g.: + // values = [1, 2, 3, 2, 4, 6, 7, 3, 6] + // keys = [0, 1, 4, 0, 1, 2, 5, 1, 2] + // \_____/ \________/ \__/ + // lengths = [3, 4, 2] +#define USE_VALUE_KEY_LENGTH_INPUT_FILLERS( \ + Context, value_index, key_index, length_index) \ + std::vector> InputFillers( \ + const std::vector>& shapes) override { \ + CAFFE_ENFORCE_EQ(shapes.size(), Operator::Inputs().size()); \ + auto fillers = Operator::InputFillers(shapes); \ + Operator::SparseLengthsFillerHelper( \ + shapes, key_index, length_index, &fillers); \ + Operator::SparseSegmentsFillerHelper( \ + shapes, value_index, key_index, &fillers); \ + return fillers; \ + } + protected: void RecordEvent(const char* err_msg = nullptr) final { if (event_) { diff --git a/caffe2/core/predictor.cc b/caffe2/core/predictor.cc index bbe487b3ac6cd7..2aaa7a2dac3a30 100644 --- a/caffe2/core/predictor.cc +++ b/caffe2/core/predictor.cc @@ -71,13 +71,13 @@ Predictor::Predictor(const MetaNetDef& def, Workspace* parent, bool run_init) const auto& inputs = getBlobs(def, PredictorConsts::default_instance().inputs_blob_type()); for (const auto& input : inputs) { - inputNames_.insert(input); + config_.input_names.emplace_back(input); } const auto& outputs = getBlobs(def, PredictorConsts::default_instance().outputs_blob_type()); for (const auto& output : outputs) { - outputNames_.emplace_back(output); + config_.output_names.emplace_back(output); } } @@ -87,19 +87,19 @@ Predictor::Predictor( Workspace* parent, bool run_init, int optimization) - : run_net_(run_net), ws_(parent) { - + : ws_(parent) { + config_.predict_net = std::make_shared(run_net); if (run_init) { CAFFE_ENFORCE(ws_.RunNetOnce(init_net)); } #if 
CAFFE2_MOBILE GlobalInit(); #endif - + auto predict_net = config_.predict_net; if (optimization) { #ifdef CAFFE2_OPTIMIZER try { - run_net_ = opt::optimize(run_net_, &ws_, optimization); + *predict_net = opt::optimize(*predict_net, &ws_, optimization); } catch (const std::exception& e) { LOG(WARNING) << "Optimization pass failed: " << e.what(); } @@ -112,45 +112,52 @@ Predictor::Predictor( const auto& initialized_vec = ws_.Blobs(); const std::unordered_set initialized{initialized_vec.begin(), initialized_vec.end()}; - for (const auto& name : run_net.external_input()) { + for (const auto& name : predict_net->external_input()) { if (!initialized.count(name)) { auto* blob = ws_.CreateBlob(name); blob->template GetMutable(); } } - CAFFE_ENFORCE(ws_.CreateNet(run_net)); + CAFFE_ENFORCE(ws_.CreateNet(predict_net)); } bool Predictor::run(const TensorVector& inputs, TensorVector* outputs) { - CAFFE_ENFORCE(inputs.size() <= (unsigned)run_net_.external_input_size()); + CAFFE_ENFORCE( + inputs.size() <= + static_cast(config_.predict_net->external_input_size())); for (size_t i = 0; i < inputs.size(); ++i) { - shareInputTensor(&ws_, run_net_.external_input(i), inputs[i]); + shareInputTensor(&ws_, config_.predict_net->external_input(i), inputs[i]); } - if (!ws_.RunNet(run_net_.name())) { + if (!ws_.RunNet(config_.predict_net->name())) { return false; } - outputs->resize(run_net_.external_output_size()); + outputs->resize(config_.predict_net->external_output_size()); for (size_t i = 0; i < outputs->size(); ++i) { - (*outputs)[i] = extractOutputTensor(&ws_, run_net_.external_output(i)); + (*outputs)[i] = + extractOutputTensor(&ws_, config_.predict_net->external_output(i)); } return true; } bool Predictor::run_map_workspace(const TensorMap& inputs) { - if (!inputNames_.empty()) { - CAFFE_ENFORCE_EQ(inputs.size(), inputNames_.size()); + if (!config_.input_names.empty()) { + CAFFE_ENFORCE_EQ(inputs.size(), input_names().size()); } for (auto input : inputs) { - if (!inputNames_.empty()) { - CAFFE_ENFORCE_GT(inputNames_.count(input.first), 0); + if (!input_names().empty()) { + CAFFE_ENFORCE( + std::find(input_names().begin(), input_names().end(), input.first) != + input_names().end(), + "Input can't be found: ", + input.first); } shareInputTensor(&ws_, input.first, input.second); } - return ws_.RunNet(run_net_.name()); + return ws_.RunNet(config_.predict_net->name()); } bool Predictor::run_map(const TensorMap& inputs, TensorVector* outputs) { @@ -158,9 +165,10 @@ bool Predictor::run_map(const TensorMap& inputs, TensorVector* outputs) { return false; } - outputs->resize(run_net_.external_output_size()); + outputs->resize(config_.predict_net->external_output_size()); for (size_t i = 0; i < outputs->size(); ++i) { - (*outputs)[i] = extractOutputTensor(&ws_, run_net_.external_output(i)); + (*outputs)[i] = + extractOutputTensor(&ws_, config_.predict_net->external_output(i)); } return true; } @@ -170,8 +178,8 @@ bool Predictor::run_map_outputs(const TensorMap& inputs, TensorMap* outputs) { return false; } - outputs->reserve(outputNames_.size()); - for (const std::string& outputName : outputNames_) { + outputs->reserve(output_names().size()); + for (const std::string& outputName : output_names()) { (*outputs)[outputName] = extractOutputTensor(&ws_, outputName); } return true; diff --git a/caffe2/core/predictor.h b/caffe2/core/predictor.h index 1212946038e834..b56401a35da5c3 100644 --- a/caffe2/core/predictor.h +++ b/caffe2/core/predictor.h @@ -2,6 +2,7 @@ #include #include "caffe2/core/net.h" +#include 
"caffe2/core/predictor_config.h" #include "caffe2/core/tensor.h" #include "caffe2/proto/metanet.pb.h" #include "caffe2/proto/predictor_consts.pb.h" @@ -52,29 +53,24 @@ class Predictor { bool run_map_outputs(const TensorMap& inputs, TensorMap* outputs); const NetDef& def() const { - return run_net_; + return *config_.predict_net; }; Workspace* ws() { return &ws_; }; - const std::unordered_set& input_names() const { - return inputNames_; + const std::vector& input_names() const { + return config_.input_names; } const std::vector& output_names() const { - return outputNames_; + return config_.output_names; } private: bool run_map_workspace(const TensorMap& inputs); - - NetDef run_net_; + PredictorConfig config_; Workspace ws_; - std::unordered_set inputNames_; - // Outputs need to be ordered since TensorVector outputs rely on the outputs - // being in a certain order. - std::vector outputNames_; }; } diff --git a/caffe2/core/predictor_config.h b/caffe2/core/predictor_config.h new file mode 100644 index 00000000000000..343c573c59e93f --- /dev/null +++ b/caffe2/core/predictor_config.h @@ -0,0 +1,36 @@ +#pragma once +#include +#include +#include "caffe2/core/tensor.h" + +namespace caffe2 { + +/* + * Parameters for a Predictor provided by name. + * They are stored as shared_ptr to accommodate parameter sharing + */ +using PredictorParameters = std::map>; + +/** + * Stores parameters nessasary for creating a PredictorInterface object. + */ +struct PredictorConfig { + // A map of parameter name to Tensor object. Predictor is supposed to + // guarantee constness of all these Tensor objects. + std::shared_ptr parameters; + + std::shared_ptr predict_net; + + // Input names of a model. User will have to provide all of the inputs + // for inference + std::vector input_names; + // Output names of a model. All outputs will be returned as results of + // inference + std::vector output_names; + // Parameter names of a model. Should be a subset of parameters map passed in. + // We provide a separate set of parameter names here as whole parameter set + // passed in by a user might contain extra tensors used by other models + std::vector parameter_names; +}; + +} // namespace caffe2 diff --git a/caffe2/core/stats.h b/caffe2/core/stats.h index 42b5c752fcce7c..09de1b1d095b81 100644 --- a/caffe2/core/stats.h +++ b/caffe2/core/stats.h @@ -258,6 +258,25 @@ class DetailedExportedStat : public ExportedStat { } }; +class StaticStat : public Stat { + private: + StatValue* value_; + + public: + StaticStat(const std::string& groupName, const std::string& name) + : Stat(groupName, name), + value_(StatRegistry::get().add(groupName + "/" + name)) {} + + int64_t increment(int64_t value = 1) { + return value_->reset(value); + } + + template + int64_t increment(T value, Unused1, Unused...) { + return increment(value); + } +}; + namespace detail { template @@ -285,7 +304,7 @@ template _ScopeGuard ScopeGuard(T f) { return _ScopeGuard(f); } -} +} // namespace detail #define CAFFE_STAT_CTOR(ClassName) \ ClassName(std::string name) : groupName(name) {} \ @@ -316,6 +335,11 @@ _ScopeGuard ScopeGuard(T f) { groupName, #name \ } +#define CAFFE_STATIC_STAT(name) \ + StaticStat name { \ + groupName, #name \ + } + #define CAFFE_EVENT(stats, field, ...) 
\ { \ auto __caffe_event_value_ = stats.field.increment(__VA_ARGS__); \ @@ -330,4 +354,4 @@ _ScopeGuard ScopeGuard(T f) { if (auto g = detail::ScopeGuard([&](int64_t nanos) { \ CAFFE_EVENT(stats, field, nanos, ##__VA_ARGS__); \ })) -} +} // namespace caffe2 diff --git a/caffe2/core/stats_test.cc b/caffe2/core/stats_test.cc index 383daaf80d35c2..5d7c86bc918d12 100644 --- a/caffe2/core/stats_test.cc +++ b/caffe2/core/stats_test.cc @@ -117,5 +117,31 @@ TEST(StatsTest, StatsTestSimple) { toMap(reg2.publish()), ExportedStatMap({{"i1/s3", 0}, {"i2/s3", 0}})); } +TEST(StatsTest, StatsTestStatic) { + struct TestStats { + CAFFE_STAT_CTOR(TestStats); + CAFFE_STATIC_STAT(cpuUsage); + CAFFE_STATIC_STAT(memUsage); + }; + TestStats i1("i1"); + TestStats i2("i2"); + CAFFE_EVENT(i1, cpuUsage, 95); + CAFFE_EVENT(i2, memUsage, 80); + + ExportedStatList data; + StatRegistry::get().publish(data); + EXPECT_SUBSET( + toMap(data), ExportedStatMap({{"i1/cpuUsage", 95}, {"i2/memUsage", 80}})); + + CAFFE_EVENT(i1, cpuUsage, 80); + CAFFE_EVENT(i1, memUsage, 50); + CAFFE_EVENT(i2, memUsage, 90); + + StatRegistry::get().publish(data); + EXPECT_SUBSET( + toMap(data), + ExportedStatMap( + {{"i1/cpuUsage", 80}, {"i1/memUsage", 50}, {"i2/memUsage", 90}})); +} } // namespace } // namespace caffe2 diff --git a/caffe2/distributed/file_store_handler_op_gpu.cc b/caffe2/distributed/file_store_handler_op_gpu.cc index c2e3ff27c937c3..6c13d14f36a6b2 100644 --- a/caffe2/distributed/file_store_handler_op_gpu.cc +++ b/caffe2/distributed/file_store_handler_op_gpu.cc @@ -1,4 +1,4 @@ -#include "file_store_handler_op.h" +#include "caffe2/distributed/file_store_handler_op.h" #include diff --git a/caffe2/distributed/redis_store_handler_op_gpu.cc b/caffe2/distributed/redis_store_handler_op_gpu.cc index 9bf8d25bba6cb1..5a759e5340a616 100644 --- a/caffe2/distributed/redis_store_handler_op_gpu.cc +++ b/caffe2/distributed/redis_store_handler_op_gpu.cc @@ -1,4 +1,4 @@ -#include "redis_store_handler_op.h" +#include "caffe2/distributed/redis_store_handler_op.h" #include diff --git a/caffe2/experiments/operators/sparse_funhash_op.h b/caffe2/experiments/operators/sparse_funhash_op.h index 04c2441f297b12..5c5f27e46667c1 100644 --- a/caffe2/experiments/operators/sparse_funhash_op.h +++ b/caffe2/experiments/operators/sparse_funhash_op.h @@ -47,6 +47,9 @@ class SparseFunHashOp : public Operator { adaptive_ = (InputSize() == 5); } + // TODO: enable the filler + DISABLE_INPUT_FILLERS(Context) + bool RunOnDevice() override { const auto& val = Input(0); const auto& key = Input(1); @@ -151,6 +154,9 @@ class SparseFunHashGradientOp : public Operator { adaptive_ = (InputSize() == 6); } + // TODO: enable the filler + DISABLE_INPUT_FILLERS(Context) + bool RunOnDevice() override { const auto& grad_out = Input(0); const auto& val = Input(1); diff --git a/caffe2/experiments/operators/sparse_matrix_reshape_op.h b/caffe2/experiments/operators/sparse_matrix_reshape_op.h index b952a72158f450..8c8d51c4ed01dc 100644 --- a/caffe2/experiments/operators/sparse_matrix_reshape_op.h +++ b/caffe2/experiments/operators/sparse_matrix_reshape_op.h @@ -91,6 +91,9 @@ class SparseMatrixReshapeOp : public Operator { new_stride_ = new_shape[1]; } + // TODO: enable the filler + DISABLE_INPUT_FILLERS(Context) + bool RunOnDevice() override { auto& old_col = Input(0); CAFFE_ENFORCE(old_col.ndim() == 1, "Row index tensor must be 1-D."); diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h index 97bc8d1b348ef3..ac27cd7253b864 100644 
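The DISABLE_INPUT_FILLERS / USE_VALUE_LENGTH_INPUT_FILLERS hooks used by these sparse operators rely on the values/lengths convention diagrammed in operator.h above: a flat values tensor is partitioned into per-example segments by a 1-D lengths tensor whose entries sum to the number of values. A minimal standalone sketch of that convention (the helper name is illustrative and not part of this change):

#include <cstddef>
#include <iostream>
#include <vector>

// Expand a (values, lengths) pair into per-example segments, e.g.
// values  = {1, 2, 3, 2, 4, 6, 7, 3, 6}, lengths = {3, 4, 2}
// becomes {1, 2, 3}, {2, 4, 6, 7}, {3, 6}.
std::vector<std::vector<int>> expandValueLength(
    const std::vector<int>& values,
    const std::vector<int>& lengths) {
  std::vector<std::vector<int>> segments;
  std::ptrdiff_t offset = 0;
  for (int len : lengths) {
    segments.emplace_back(
        values.begin() + offset, values.begin() + offset + len);
    offset += len;
  }
  return segments; // sum(lengths) must equal values.size()
}

int main() {
  auto segs = expandValueLength({1, 2, 3, 2, 4, 6, 7, 3, 6}, {3, 4, 2});
  for (const auto& s : segs) {
    for (int v : s) std::cout << v << ' ';
    std::cout << '\n';
  }
}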
--- a/caffe2/ideep/operators/operator_fallback_ideep.h +++ b/caffe2/ideep/operators/operator_fallback_ideep.h @@ -81,6 +81,18 @@ class IDEEPFallbackOp final : public IDEEPOperator { } else { input.reorder_to(dtensor->template mutable_data()); } + } else if ( + InputIsType(i) && + Input(i).get_data_type() == itensor::data_type::s32) { + auto& input = Input(i); + auto dtensor = local_input_blobs_[i]->template GetMutable(); + dtensor->Resize(input.get_dims()); + if (input.is_public_format()) { + dtensor->ShareExternalPointer( + static_cast(input.get_data_handle())); + } else { + input.reorder_to(dtensor->template mutable_data()); + } } else { VLOG(1) << "Input " << i << " is not ideep::tensor. Skipping copy."; // Note(jiayq): This removes a const but conceptually @@ -122,6 +134,19 @@ class IDEEPFallbackOp final : public IDEEPOperator { dtensor->resize(dst_dims, itensor::data_type::f32); } dtensor->set_data_handle(const_cast(src.raw_data())); + } else if (src.template IsType()) { + Blob* dst = OperatorBase::OutputBlob(i); + if (!dst->template IsType()) { + dst->Reset(new itensor()); + } + + auto src_dims = src.dims(); + itensor::dims dst_dims(src_dims.begin(), src_dims.end()); + auto dtensor = dst->template GetMutable(); + if (dtensor->get_dims() != dst_dims) { + dtensor->resize(dst_dims, itensor::data_type::s32); + } + dtensor->set_data_handle(const_cast(src.raw_data())); } else { CAFFE_THROW("ideep memory only supports float data type."); } diff --git a/caffe2/image/CMakeLists.txt b/caffe2/image/CMakeLists.txt index 84d9007ad6a7da..fdc74ec0e50328 100644 --- a/caffe2/image/CMakeLists.txt +++ b/caffe2/image/CMakeLists.txt @@ -11,6 +11,14 @@ if(USE_OPENCV AND OpenCV_FOUND) file(GLOB tmp *_test.cc) exclude(Caffe2_GPU_SRCS "${Caffe2_GPU_SRCS}" ${tmp}) + # ---[ HIP files + # ------[ general hip + file(GLOB_RECURSE tmp *_hip.cc) + set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp}) + # exclude test files + file(GLOB_RECURSE tmp *_test.cc) + exclude(Caffe2_HIP_SRCS "${Caffe2_HIP_SRCS}" ${tmp}) + # ---[ CPU files. file(GLOB tmp *.cc) set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp}) @@ -18,21 +26,29 @@ if(USE_OPENCV AND OpenCV_FOUND) file(GLOB tmp *_test.cc) exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp}) exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_GPU_SRCS}) + exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_HIP_SRCS}) # ---[ GPU test files file(GLOB tmp *_gpu_test.cc) set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} ${tmp}) + # ---[ HIP test files + file(GLOB_RECURSE tmp *_hip_test.cc) + set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} ${tmp}) + # ---[ CPU test files file(GLOB tmp *_test.cc) set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp}) exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_GPU_TEST_SRCS}) + exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_HIP_TEST_SRCS}) # ---[ Send the lists to the parent scope. 
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE) set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE) + set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE) set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE) set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE) + set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE) else() message(STATUS "Excluding image processing operators due to no opencv") endif() diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc index de44ab67cf6c00..6f3986b837d308 100644 --- a/caffe2/onnx/backend.cc +++ b/caffe2/onnx/backend.cc @@ -340,8 +340,8 @@ Caffe2Backend::get_special_operators() const { {"Cast", &Caffe2Backend::CreateCast}, {"Constant", &Caffe2Backend::CreateConstant}, {"Conv", &Caffe2Backend::CreateConvPoolOpBase}, - {"AveragePool", &Caffe2Backend::CreateConvPoolOpBase}, - {"GlobalAveragePool", &Caffe2Backend::CreateConvPoolOpBase}, + {"AveragePool", &Caffe2Backend::CreatePadPool}, + {"GlobalAveragePool", &Caffe2Backend::CreatePadPool}, {"GlobalMaxPool", &Caffe2Backend::CreateConvPoolOpBase}, {"MaxPool", &Caffe2Backend::CreateConvPoolOpBase}, {"Reshape", &Caffe2Backend::CreateReshape}, @@ -515,6 +515,63 @@ Caffe2Ops Caffe2Backend::CreateConvPoolOpBase( return CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); } +Caffe2Ops Caffe2Backend::CreatePadPool(OnnxNode* onnx_node, int opset_version) { + auto& node = onnx_node->node; + auto& attributes = onnx_node->attributes; + Caffe2Ops ret; + // Pad + bool padding = false; + const std::string pad_name = opset_version < 2 ? "paddings" : "pads"; + const auto pad_input = dummy_->NewDummyName(); + if (attributes.HasAttribute("count_include_pad") && + attributes.HasAttribute(pad_name)) { + auto count_include_pad = attributes.get("count_include_pad", 0L); + ::google::protobuf::RepeatedField<::google::protobuf::int64> pads; + pads = + attributes + .get<::google::protobuf::RepeatedField<::google::protobuf::int64>>( + pad_name); + if (count_include_pad == 1 && pads.size() == 4 && + !(pads.Get(0) == 0 && pads.Get(1) == 0 && pads.Get(2) == 0 && + pads.Get(3) == 0)) { + padding = true; + attributes.remove(pad_name); + caffe2::Argument arg_pads; + arg_pads.add_ints(pads.Get(0)); + arg_pads.add_ints(pads.Get(1)); + arg_pads.add_ints(pads.Get(2)); + arg_pads.add_ints(pads.Get(3)); + arg_pads.set_name("pads"); + auto* c2_op = ret.ops.Add(); + BuildOperator( + c2_op, "PadImage", {node.input(0)}, {pad_input}, {arg_pads}); + } else if (count_include_pad == 1) { + std::string str; + bool pads_flag = false; + str += "["; + for (const auto& i : pads) { + str += caffe2::to_string(i) + ","; + pads_flag = pads_flag || i > 0; + } + str += "]"; + if (pads_flag == true) { + CAFFE_THROW( + "Caffe2 only supports padding 2D Tensor, whereas padding is ", str); + } + } + } + // Pool + auto c2_ops = Caffe2Backend::CreateConvPoolOpBase(onnx_node, opset_version); + auto* pool_op = c2_ops.ops.Mutable(0); + if (padding) { + pool_op->set_input(0, pad_input); + } + auto* c2_op = ret.ops.Add(); + c2_op->CopyFrom(*pool_op); + + return ret; +} + Caffe2Ops Caffe2Backend::CreateReshape(OnnxNode* onnx_node, int opset_version) { auto c2_op = CommonOnnxNodeToCaffe2Ops(onnx_node, opset_version); CAFFE_ENFORCE_EQ(c2_op.ops.size(), 1); diff --git a/caffe2/onnx/backend.h b/caffe2/onnx/backend.h index e8a8ec3c65bc57..437e572b8528b7 100644 --- a/caffe2/onnx/backend.h +++ b/caffe2/onnx/backend.h @@ -168,6 +168,8 @@ class Caffe2Backend { Caffe2Ops CreateConvPoolOpBase(OnnxNode* onnx_node, int opset_version); + Caffe2Ops 
CreatePadPool(OnnxNode* onnx_node, int opset_version); + Caffe2Ops CreateReshape(OnnxNode* onnx_node, int opset_version); Caffe2Ops CreateGather(OnnxNode* onnx_node, int opset_version); diff --git a/caffe2/onnx/helper.h b/caffe2/onnx/helper.h index 85b27dda87d524..42c9a639431de3 100644 --- a/caffe2/onnx/helper.h +++ b/caffe2/onnx/helper.h @@ -40,6 +40,18 @@ inline AttributeProto MakeAttribute( return attr; } +inline AttributeProto MakeAttribute( + const std::string& name, + const std::vector& vals) { + AttributeProto attr; + attr.set_name(name); + for (const auto v : vals) { + attr.add_floats(v); + } + attr.set_type(AttributeProto::FLOATS); + return attr; +} + inline AttributeProto MakeAttribute(const std::string& name, int64_t val) { AttributeProto attr; attr.set_name(name); diff --git a/caffe2/onnx/onnx_exporter.cc b/caffe2/onnx/onnx_exporter.cc index 02d870d511d6d2..0c36e26d248e9f 100644 --- a/caffe2/onnx/onnx_exporter.cc +++ b/caffe2/onnx/onnx_exporter.cc @@ -245,7 +245,8 @@ OnnxExporter::get_special_operators() const { {"LRN", &OnnxExporter::CreateLrnNodes}, {"Reshape", &OnnxExporter::CreateReshapeNodes}, {"Slice", &OnnxExporter::CreateSliceNodes}, - {"ChannelShuffle", &OnnxExporter::CreateChannelShuffleNodes}}; + {"ChannelShuffle", &OnnxExporter::CreateChannelShuffleNodes}, + {"ResizeNearest", &OnnxExporter::CreateUpsampleNodes}}; return kSpecialOperators; } @@ -681,6 +682,41 @@ ConvertedResult OnnxExporter::CreateChannelShuffleNodes( return result; } +ConvertedResult OnnxExporter::CreateUpsampleNodes( + const caffe2::OperatorDef& def, + const std::unordered_map& shapes) { + float width_scale = 1.0; + float height_scale = 1.0; + for (const auto& a : def.arg()) { + if (a.name() == "width_scale") { + width_scale = a.f(); + } else if (a.name() == "height_scale") { + height_scale = a.f(); + } + } + CAFFE_ENFORCE_GT(width_scale, 0); + CAFFE_ENFORCE_GT(height_scale, 0); + + auto x = def.input(0); + const auto& x_shape = shapes.at(x); + CAFFE_ENFORCE_GE(x_shape.dims().size(), 2); + + std::vector scales(x_shape.dims().size(), 1.0); + scales[scales.size() - 2] = height_scale; + scales[scales.size() - 1] = width_scale; + + ConvertedResult result; + auto& nodes = result.first; + std::vector inputs(def.input().begin(), def.input().end()); + std::vector outputs(def.output().begin(), def.output().end()); + auto node = MakeNode("Upsample", inputs, outputs, def.name()); + node.add_attribute()->CopyFrom(MakeAttribute("scales", scales)); + node.add_attribute()->CopyFrom(MakeAttribute("mode", "nearest")); + nodes.emplace_back(node); + + return result; +} + ConvertedResult OnnxExporter::CreateSliceNodes( const caffe2::OperatorDef& def, const std::unordered_map& shapes) { diff --git a/caffe2/onnx/onnx_exporter.h b/caffe2/onnx/onnx_exporter.h index 7fcd54044d9d66..51f62df0eb2212 100644 --- a/caffe2/onnx/onnx_exporter.h +++ b/caffe2/onnx/onnx_exporter.h @@ -92,6 +92,10 @@ class OnnxExporter { const caffe2::OperatorDef& def, const std::unordered_map& shapes); + ConvertedResult CreateUpsampleNodes( + const caffe2::OperatorDef& def, + const std::unordered_map& shapes); + // \brief Check black listed arguemnts where we won't pass down when // converting to ONNX node bool IsBlackListed(const caffe2::Argument& arg); diff --git a/caffe2/onnx/onnxifi_init.cc b/caffe2/onnx/onnxifi_init.cc index 7ec264b342fe63..62c44b0b38b3e1 100644 --- a/caffe2/onnx/onnxifi_init.cc +++ b/caffe2/onnx/onnxifi_init.cc @@ -11,7 +11,7 @@ onnxifi_library* initOnnxifiLibrary() { static onnxifi_library core{}; std::call_once(once, []() { 
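The CreateUpsampleNodes change above translates Caffe2's ResizeNearest into an ONNX Upsample node whose FLOATS attribute "scales" has one entry per input dimension, scaling only the two trailing spatial dimensions. A small sketch of that layout (the helper function is illustrative, not part of the exporter):

#include <cassert>
#include <cstddef>
#include <vector>

// Build the per-dimension scale factors the exporter emits: leading
// dimensions keep a scale of 1, the last two get height/width scales.
std::vector<float> upsampleScales(
    std::size_t rank, float height_scale, float width_scale) {
  assert(rank >= 2);
  std::vector<float> scales(rank, 1.0f);
  scales[rank - 2] = height_scale;
  scales[rank - 1] = width_scale;
  return scales;
}

// For a 4-D NCHW ResizeNearest with height_scale = width_scale = 2,
// upsampleScales(4, 2.0f, 2.0f) yields {1, 1, 2, 2}, which is attached as the
// "scales" attribute of an Upsample node with mode = "nearest".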
auto ret = - onnxifi_load(ONNXIFI_LOADER_FLAG_VERSION_1_0, nullptr, nullptr, &core); + onnxifi_load(ONNXIFI_LOADER_FLAG_VERSION_1_0, nullptr, &core); if (!ret) { CAFFE_THROW("Cannot load onnxifi lib"); } diff --git a/caffe2/operators/affine_channel_op.cc b/caffe2/operators/affine_channel_op.cc index 26953876b4891a..823a3cf8fee378 100644 --- a/caffe2/operators/affine_channel_op.cc +++ b/caffe2/operators/affine_channel_op.cc @@ -21,6 +21,8 @@ void AffineChannelScaleBiasBackwardNCHW( const int stride = C * HxW; EigenVectorArrayMap dscale_arr(dscale, C); EigenVectorArrayMap dbias_arr(dbias, C); + dscale_arr.setZero(); + dbias_arr.setZero(); for (int i = 0; i < N; ++i) { ConstEigenArrayMap dY_arr(dY_ptr, HxW, C); ConstEigenArrayMap X_arr(X_ptr, HxW, C); diff --git a/caffe2/operators/affine_channel_op_cudnn.cc b/caffe2/operators/affine_channel_op_cudnn.cc deleted file mode 100644 index e3bf3b140a3ec3..00000000000000 --- a/caffe2/operators/affine_channel_op_cudnn.cc +++ /dev/null @@ -1,371 +0,0 @@ -#include "caffe2/operators/affine_channel_op.h" - -#include -#include -#include - -#include "caffe2/core/context_gpu.h" -#include "caffe2/core/cudnn_wrappers.h" -#include "caffe2/core/types.h" -#include "caffe2/utils/conversions.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -namespace { - -class CuDNNAffineChannelOpBase : public Operator { - public: - CuDNNAffineChannelOpBase(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - order_(StringToStorageOrder( - OperatorBase::GetSingleArgument("order", "NCHW"))), - OP_SINGLE_ARG(bool, "is_learnable", is_learnable_, false), - cudnn_wrapper_(&context_) { - CAFFE_ENFORCE_NE(order_, StorageOrder::UNKNOWN); - - CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&X_desc_)); - CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&scale_desc_)); - CUDNN_ENFORCE(cudnnCreateOpTensorDescriptor(&mul_desc_)); - } - - virtual ~CuDNNAffineChannelOpBase() { - CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(X_desc_)); - CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(scale_desc_)); - CUDNN_ENFORCE(cudnnDestroyOpTensorDescriptor(mul_desc_)); - } - - protected: - void SetTensorDesc4D( - const cudnnDataType_t cudnn_type, - const int N, - const int C, - const int H, - const int W) { - CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( - X_desc_, GetCudnnTensorFormat(order_), cudnn_type, N, C, H, W)); - CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( - scale_desc_, GetCudnnTensorFormat(order_), cudnn_type, 1, C, 1, 1)); - } - - void SetTensorDescND( - const cudnnDataType_t cudnn_type, - const std::vector& X_dims) { - const int ndim = X_dims.size(); - const int C_dim = order_ == StorageOrder::NCHW ? 
1 : ndim - 1; - const int C = X_dims[C_dim]; - std::vector X_strides(ndim); - X_strides.back() = 1; - for (int i = ndim - 1; i > 0; --i) { - X_strides[i - 1] = X_strides[i] * X_dims[i]; - } - std::vector scale_dims(ndim, 1); - scale_dims[C_dim] = C; - std::vector scale_strides(ndim); - std::fill(scale_strides.begin(), scale_strides.begin() + C_dim, C); - std::fill(scale_strides.begin() + C_dim, scale_strides.end(), 1); - CUDNN_ENFORCE(cudnnSetTensorNdDescriptor( - X_desc_, cudnn_type, ndim, X_dims.data(), X_strides.data())); - CUDNN_ENFORCE(cudnnSetTensorNdDescriptor( - scale_desc_, - cudnn_type, - ndim, - scale_dims.data(), - scale_strides.data())); - } - - const StorageOrder order_; - const bool is_learnable_; - - CuDNNWrapper cudnn_wrapper_; - cudnnTensorDescriptor_t X_desc_; - cudnnTensorDescriptor_t scale_desc_; - cudnnOpTensorDescriptor_t mul_desc_; -}; - -class CuDNNAffineChannelOp final : public CuDNNAffineChannelOpBase { - public: - CuDNNAffineChannelOp(const OperatorDef& operator_def, Workspace* ws) - : CuDNNAffineChannelOpBase(operator_def, ws) { - CUDNN_ENFORCE(cudnnCreateOpTensorDescriptor(&add_desc_)); - } - - ~CuDNNAffineChannelOp() { - CUDNN_ENFORCE(cudnnDestroyOpTensorDescriptor(add_desc_)); - } - - bool RunOnDevice() override { - return DispatchHelper>::call(this, Input(0)); - } - - template - bool DoRunWithType() { - const auto& X = Input(0); - const auto& scale = Input(1); - const auto& bias = Input(2); - auto* Y = Output(0); - if (is_learnable_) { - CAFFE_ENFORCE_NE( - Y, - &X, - "In-place affine_channel_op is not supported when " - "is_learnable = true."); - } - Y->ResizeLike(X); - const T* X_data = X.data(); - const T* scale_data = scale.data(); - const T* bias_data = bias.data(); - T* Y_data = Y->mutable_data(); - const int ndim = X.ndim(); - CAFFE_ENFORCE_GE(ndim, 4); - const cudnnDataType_t cudnn_type = cudnnTypeWrapper::type; - if (ndim == 4) { - const int N = X.dim32(0); - const int C = order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(3); - const int H = order_ == StorageOrder::NCHW ? X.dim32(2) : X.dim32(1); - const int W = order_ == StorageOrder::NCHW ? 
X.dim32(3) : X.dim32(2); - SetTensorDesc4D(cudnn_type, N, C, H, W); - } else { - const std::vector X_dims(X.dims().cbegin(), X.dims().cend()); - SetTensorDescND(cudnn_type, X_dims); - } - CUDNN_ENFORCE(cudnnSetOpTensorDescriptor( - mul_desc_, CUDNN_OP_TENSOR_MUL, cudnn_type, CUDNN_PROPAGATE_NAN)); - CUDNN_ENFORCE(cudnnOpTensor( - cudnn_wrapper_.inline_cudnn_handle(), - mul_desc_, - cudnnTypeWrapper::kOne(), - X_desc_, - X_data, - cudnnTypeWrapper::kOne(), - scale_desc_, - scale_data, - cudnnTypeWrapper::kZero(), - X_desc_, - Y_data)); - if (ndim == 4) { - CUDNN_ENFORCE(cudnnAddTensor( - cudnn_wrapper_.inline_cudnn_handle(), - cudnnTypeWrapper::kOne(), - scale_desc_, - bias_data, - cudnnTypeWrapper::kOne(), - X_desc_, - Y_data)); - } else { - CUDNN_ENFORCE(cudnnSetOpTensorDescriptor( - add_desc_, CUDNN_OP_TENSOR_ADD, cudnn_type, CUDNN_PROPAGATE_NAN)); - CUDNN_ENFORCE(cudnnOpTensor( - cudnn_wrapper_.inline_cudnn_handle(), - add_desc_, - cudnnTypeWrapper::kOne(), - X_desc_, - Y_data, - cudnnTypeWrapper::kOne(), - scale_desc_, - bias_data, - cudnnTypeWrapper::kZero(), - X_desc_, - Y_data)); - } - return true; - } - - private: - cudnnOpTensorDescriptor_t add_desc_; -}; - -class CuDNNAffineChannelGradientOp final : public CuDNNAffineChannelOpBase { - public: - CuDNNAffineChannelGradientOp(const OperatorDef& operator_def, Workspace* ws) - : CuDNNAffineChannelOpBase(operator_def, ws) { -#if CUDNN_VERSION_MIN(6, 0, 0) - CUDNN_ENFORCE(cudnnCreateReduceTensorDescriptor(&reduce_desc_)); -#endif - } - - ~CuDNNAffineChannelGradientOp() { -#if CUDNN_VERSION_MIN(6, 0, 0) - CUDNN_ENFORCE(cudnnDestroyReduceTensorDescriptor(reduce_desc_)); -#endif - } - - bool RunOnDevice() override { - return DispatchHelper>::call(this, Input(0)); - } - - template - bool DoRunWithType() { - const auto& dY = Input(0); - const auto& scale = is_learnable_ ? 
Input(2) : Input(1); - auto* dX = Output(0); - dX->ResizeLike(dY); - const T* dY_data = dY.data(); - const T* scale_data = scale.data(); - T* dX_data = dX->mutable_data(); - const int ndim = dY.ndim(); - CAFFE_ENFORCE_GE(ndim, 4); - const cudnnDataType_t cudnn_type = cudnnTypeWrapper::type; - const std::vector X_dims(dY.dims().cbegin(), dY.dims().cend()); - SetTensorDescND(cudnn_type, X_dims); - CUDNN_ENFORCE(cudnnSetOpTensorDescriptor( - mul_desc_, CUDNN_OP_TENSOR_MUL, cudnn_type, CUDNN_PROPAGATE_NAN)); - CUDNN_ENFORCE(cudnnOpTensor( - cudnn_wrapper_.inline_cudnn_handle(), - mul_desc_, - cudnnTypeWrapper::kOne(), - X_desc_, - dY_data, - cudnnTypeWrapper::kOne(), - scale_desc_, - scale_data, - cudnnTypeWrapper::kZero(), - X_desc_, - dX_data)); - if (is_learnable_) { - const auto& X = Input(1); - const T* X_data = X.data(); - auto* dscale = Output(1); - auto* dbias = Output(2); - dscale->ResizeLike(scale); - dbias->ResizeLike(scale); - T* dscale_data = dscale->mutable_data(); - T* dbias_data = dbias->mutable_data(); - if (X.size() == scale.size()) { - CUDNN_ENFORCE(cudnnOpTensor( - cudnn_wrapper_.inline_cudnn_handle(), - mul_desc_, - cudnnTypeWrapper::kOne(), - X_desc_, - dY_data, - cudnnTypeWrapper::kOne(), - X_desc_, - X_data, - cudnnTypeWrapper::kZero(), - X_desc_, - dscale_data)); - context_.Copy( - dY.size(), dY_data, dbias_data); - } else { - dYxX_.ResizeLike(X); - T* dYxX_data = dYxX_.mutable_data(); - CUDNN_ENFORCE(cudnnOpTensor( - cudnn_wrapper_.inline_cudnn_handle(), - mul_desc_, - cudnnTypeWrapper::kOne(), - X_desc_, - dY_data, - cudnnTypeWrapper::kOne(), - X_desc_, - X_data, - cudnnTypeWrapper::kZero(), - X_desc_, - dYxX_data)); -#if CUDNN_VERSION_MIN(6, 0, 0) - ComputeScaleBiasGradient( - dYxX_data, dY_data, dscale_data, dbias_data); -#else - const int N = X.dim32(0); - const int C = - order_ == StorageOrder::NCHW ? 
X.dim32(1) : X.dim32(ndim - 1); - const int HxW = X.size() / (N * C); - ComputeScaleBiasGradientFallback( - N, C, HxW, dYxX_data, dY_data, dscale_data, dbias_data); -#endif - } - } - return true; - } - - private: -#if CUDNN_VERSION_MIN(6, 0, 0) - template - void - ComputeScaleBiasGradient(const T* dYxX, const T* dY, T* dscale, T* dbias) { - const cudnnDataType_t cudnn_type = cudnnTypeWrapper::type; - CUDNN_ENFORCE(cudnnSetReduceTensorDescriptor( - reduce_desc_, - CUDNN_REDUCE_TENSOR_ADD, - cudnn_type, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_32BIT_INDICES)); - std::size_t workspace_size = 0; - CUDNN_ENFORCE(cudnnGetReductionWorkspaceSize( - cudnn_wrapper_.inline_cudnn_handle(), - reduce_desc_, - X_desc_, - scale_desc_, - &workspace_size)); - workspace_buff_.Resize((workspace_size + sizeof(T) - 1) / sizeof(T)); - T* workspace_data = workspace_buff_.mutable_data(); - CUDNN_ENFORCE(cudnnReduceTensor( - cudnn_wrapper_.inline_cudnn_handle(), - reduce_desc_, - nullptr, - 0, - workspace_data, - workspace_size, - cudnnTypeWrapper::kOne(), - X_desc_, - dYxX, - cudnnTypeWrapper::kZero(), - scale_desc_, - dscale)); - CUDNN_ENFORCE(cudnnReduceTensor( - cudnn_wrapper_.inline_cudnn_handle(), - reduce_desc_, - nullptr, - 0, - workspace_data, - workspace_size, - cudnnTypeWrapper::kOne(), - X_desc_, - dY, - cudnnTypeWrapper::kZero(), - scale_desc_, - dbias)); - } -#else - template - void ComputeScaleBiasGradientFallback( - const int N, - const int C, - const int HxW, - const T* dYxX, - const T* dY, - T* dscale, - T* dbias) { - if (order_ == StorageOrder::NCHW) { - std::array dims = {N, C, HxW}; - std::array axes = {0, 2}; - math::ReduceSum( - 3, dims.data(), 2, axes.data(), dYxX, dscale, &context_); - math::ReduceSum( - 3, dims.data(), 2, axes.data(), dY, dbias, &context_); - } else { - std::array dims = {N * HxW, C}; - const int axis = 0; - math::ReduceSum( - 2, dims.data(), 1, &axis, dYxX, dscale, &context_); - math::ReduceSum( - 2, dims.data(), 1, &axis, dY, dbias, &context_); - } - } -#endif - - Tensor dYxX_; - -#if CUDNN_VERSION_MIN(6, 0, 0) - cudnnReduceTensorDescriptor_t reduce_desc_; - - Tensor workspace_buff_; -#endif -}; - -} // namespace - -REGISTER_CUDNN_OPERATOR(AffineChannel, CuDNNAffineChannelOp); -REGISTER_CUDNN_OPERATOR(AffineChannelGradient, CuDNNAffineChannelGradientOp); - -} // namespace caffe2 diff --git a/caffe2/operators/batch_moments_op.cc b/caffe2/operators/batch_moments_op.cc new file mode 100644 index 00000000000000..8247b9af228cb8 --- /dev/null +++ b/caffe2/operators/batch_moments_op.cc @@ -0,0 +1,122 @@ +#include "caffe2/operators/batch_moments_op.h" + +#include +#include + +#include "caffe2/utils/eigen_utils.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template <> +bool BatchMomentsOp::ComputeBatchMomentsNCHW( + const int N, + const int C, + const int HxW, + const float* X, + float* mu, + float* var) { + math::Set(C, 0.0f, mu, &context_); + math::Set(C, 0.0f, var, &context_); + EigenVectorArrayMap mu_arr(mu, C); + EigenVectorArrayMap var_arr(var, C); + const float* X_ptr = X; + const int stride = C * HxW; + for (int i = 0; i < N; ++i) { + ConstEigenArrayMap X_arr(X_ptr, HxW, C); + mu_arr += X_arr.colwise().sum(); + var_arr += X_arr.square().colwise().sum(); + X_ptr += stride; + } + const float scale = 1.0f / static_cast(N * HxW); + math::Scale(C, scale, mu, mu, &context_); + math::Scale(C, scale, var, var, &context_); + return true; +} + +template <> +bool BatchMomentsOp::ComputeBatchMomentsNHWC( + const int N, + const int C, + const int 
HxW, + const float* X, + float* mu, + float* var) { + ConstEigenArrayMap X_arr(X, C, N * HxW); + EigenVectorMap(mu, C) = X_arr.rowwise().mean(); + EigenVectorMap(var, C) = X_arr.square().rowwise().mean(); + return true; +} + +template <> +bool BatchMomentsGradientOp::ComputeBatchMomentsGradientNCHW( + const int N, + const int C, + const int HxW, + const float* dmu, + const float* dvar, + const float* X, + float* dX) { + ConstEigenVectorArrayMap dmu_arr(dmu, C); + ConstEigenVectorArrayMap dvar_arr(dvar, C); + const float* X_ptr = X; + float* dX_ptr = dX; + const int stride = C * HxW; + for (int i = 0; i < N; ++i) { + EigenArrayMap dX_arr(dX_ptr, HxW, C); + dX_arr = ConstEigenArrayMap(X_ptr, HxW, C).rowwise() * + dvar_arr.transpose() * 2.0f; + dX_arr.rowwise() += dmu_arr.transpose(); + X_ptr += stride; + dX_ptr += stride; + } + const float scale = 1.0f / static_cast(N * HxW); + math::Scale(N * C * HxW, scale, dX, dX, &context_); + return true; +} + +template <> +bool BatchMomentsGradientOp::ComputeBatchMomentsGradientNHWC( + const int N, + const int C, + const int HxW, + const float* dmu, + const float* dvar, + const float* X, + float* dX) { + const float scale = 1.0f / static_cast(N * HxW); + EigenArrayMap dX_arr(dX, C, N * HxW); + dX_arr = ConstEigenArrayMap(X, C, N * HxW).colwise() * + ConstEigenVectorArrayMap(dvar, C) * 2.0f; + dX_arr.colwise() += ConstEigenVectorArrayMap(dmu, C); + math::Scale(N * C * HxW, scale, dX, dX, &context_); + return true; +} + +REGISTER_CPU_OPERATOR(BatchMoments, BatchMomentsOp); +REGISTER_CPU_OPERATOR( + BatchMomentsGradient, + BatchMomentsGradientOp); + +OPERATOR_SCHEMA(BatchMoments).NumInputs(1).NumOutputs(2); +OPERATOR_SCHEMA(BatchMomentsGradient).NumInputs(3).NumOutputs(1); + +namespace { + +class GetBatchMomentsGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + + std::vector GetGradientDefs() override { + return SingleGradientDef( + "BatchMomentsGradient", + "", + std::vector{GO(0), GO(1), I(0)}, + std::vector{GI(0)}); + } +}; + +} // namespace + +REGISTER_GRADIENT(BatchMoments, GetBatchMomentsGradient); + +} // namespace caffe2 diff --git a/caffe2/operators/batch_moments_op.cu b/caffe2/operators/batch_moments_op.cu new file mode 100644 index 00000000000000..7aadc8d8e69e07 --- /dev/null +++ b/caffe2/operators/batch_moments_op.cu @@ -0,0 +1,152 @@ +#include "caffe2/operators/batch_moments_op.h" + +#include + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +namespace { + +template +using BlockReduce = cub::BlockReduce; + +template +__global__ void BatchMomentsCUDAKernel( + const int N, + const int C, + const int HxW, + const T* X, + T* mu, + T* var) { + const int outer_size = C; + const int inner_size = N * HxW; + __shared__ typename BlockReduce::TempStorage m_storage; + __shared__ typename BlockReduce::TempStorage v_storage; + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + T m_sum = 0; + T v_sum = 0; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = kOrder == StorageOrder::NCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; +#if __CUDA_ARCH__ >= 350 + m_sum += __ldg(X + index); + v_sum += __ldg(X + index) * __ldg(X + index); +#else + m_sum += X[index]; + v_sum += X[index] * X[index]; +#endif + } + m_sum = BlockReduce(m_storage).Reduce(m_sum, cub::Sum()); + v_sum = BlockReduce(v_storage).Reduce(v_sum, cub::Sum()); + if (threadIdx.x == 0) { + mu[i] = m_sum / static_cast(N * HxW); + var[i] = v_sum / static_cast(N * HxW); + } + __syncthreads(); + } +} + +template +__global__ void BatchMomentsGradientCUDAKernel( + const int N, + const int C, + const int HxW, + const T* dmu, + const T* dvar, + const T* X, + T* dX) { + const int size = N * C * HxW; + const T scale = T(1) / static_cast(N * HxW); + CUDA_1D_KERNEL_LOOP(i, size) { + const int i_mu = kOrder == StorageOrder::NCHW ? i / (HxW) % C : i % C; +#if __CUDA_ARCH__ >= 350 + dX[i] = + (__ldg(dmu + i_mu) + __ldg(dvar + i_mu) * T(2) * __ldg(X + i)) * scale; +#else + dX[i] = (dmu[i_mu] + dvar[i_mu] * T(2) * X[i]) * scale; +#endif + } +} + +} // namespace + +template <> +bool BatchMomentsOp::ComputeBatchMomentsNCHW( + const int N, + const int C, + const int HxW, + const float* X, + float* mu, + float* var) { + const int outer_size = N * HxW; + BatchMomentsCUDAKernel + <<>>(N, C, HxW, X, mu, var); + return true; +} + +template <> +bool BatchMomentsOp::ComputeBatchMomentsNHWC( + const int N, + const int C, + const int HxW, + const float* X, + float* mu, + float* var) { + const int outer_size = N * HxW; + BatchMomentsCUDAKernel + <<>>(N, C, HxW, X, mu, var); + return true; +} + +template <> +bool BatchMomentsGradientOp:: + ComputeBatchMomentsGradientNCHW( + const int N, + const int C, + const int HxW, + const float* dmu, + const float* dvar, + const float* X, + float* dX) { + const int size = N * C * HxW; + BatchMomentsGradientCUDAKernel + <<>>(N, C, HxW, dmu, dvar, X, dX); + return true; +} + +template <> +bool BatchMomentsGradientOp:: + ComputeBatchMomentsGradientNHWC( + const int N, + const int C, + const int HxW, + const float* dmu, + const float* dvar, + const float* X, + float* dX) { + const int size = N * C * HxW; + BatchMomentsGradientCUDAKernel + <<>>(N, C, HxW, dmu, dvar, X, dX); + return true; +} + +REGISTER_CUDA_OPERATOR(BatchMoments, BatchMomentsOp); +REGISTER_CUDA_OPERATOR( + BatchMomentsGradient, + BatchMomentsGradientOp); + +} // namespace caffe2 diff --git a/caffe2/operators/batch_moments_op.h b/caffe2/operators/batch_moments_op.h new file mode 100644 index 00000000000000..eea4c84943d0fe --- /dev/null +++ b/caffe2/operators/batch_moments_op.h @@ -0,0 +1,117 @@ +#ifndef CAFFE2_OPERATORS_BATCH_MOMENTS_OP_H_ +#define CAFFE2_OPERATORS_BATCH_MOMENTS_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" + +namespace caffe2 { + +template +class BatchMomentsOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + + BatchMomentsOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + order_(StringToStorageOrder( + OperatorBase::GetSingleArgument("order", "NCHW"))) { + CAFFE_ENFORCE_NE(order_, StorageOrder::UNKNOWN); + } + + bool RunOnDevice() override { + const auto& X = Input(0); + auto* mu = Output(0); + auto* var = Output(1); + const int ndim = X.ndim(); + const int N = X.dim32(0); + const int C = order_ == StorageOrder::NCHW ? 
X.dim32(1) : X.dim32(ndim - 1); + const int HxW = X.size() / (N * C); + mu->Resize(C); + var->Resize(C); + const T* X_data = X.template data(); + T* mu_data = mu->template mutable_data(); + T* var_data = var->template mutable_data(); + return order_ == StorageOrder::NCHW + ? ComputeBatchMomentsNCHW(N, C, HxW, X_data, mu_data, var_data) + : ComputeBatchMomentsNHWC(N, C, HxW, X_data, mu_data, var_data); + } + + private: + bool ComputeBatchMomentsNCHW( + const int N, + const int C, + const int HxW, + const T* X, + T* mu, + T* var); + + bool ComputeBatchMomentsNHWC( + const int N, + const int C, + const int HxW, + const T* X, + T* mu, + T* var); + + const StorageOrder order_; +}; + +template +class BatchMomentsGradientOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + + BatchMomentsGradientOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + order_(StringToStorageOrder( + OperatorBase::GetSingleArgument("order", "NCHW"))) { + CAFFE_ENFORCE_NE(order_, StorageOrder::UNKNOWN); + } + + bool RunOnDevice() override { + const auto& dmu = Input(0); + const auto& dvar = Input(1); + const auto& X = Input(2); + auto* dX = Output(0); + const int ndim = X.ndim(); + const int N = X.dim32(0); + const int C = order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(ndim - 1); + const int HxW = X.size() / (N * C); + dX->ResizeLike(X); + const T* dmu_data = dmu.template data(); + const T* dvar_data = dvar.template data(); + const T* X_data = X.template data(); + T* dX_data = dX->template mutable_data(); + return order_ == StorageOrder::NCHW + ? ComputeBatchMomentsGradientNCHW( + N, C, HxW, dmu_data, dvar_data, X_data, dX_data) + : ComputeBatchMomentsGradientNHWC( + N, C, HxW, dmu_data, dvar_data, X_data, dX_data); + } + + private: + bool ComputeBatchMomentsGradientNCHW( + const int N, + const int C, + const int HxW, + const T* dmu, + const T* dvar, + const T* X, + T* dX); + + bool ComputeBatchMomentsGradientNHWC( + const int N, + const int C, + const int HxW, + const T* dmu, + const T* dvar, + const T* X, + T* dX); + + const StorageOrder order_; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_BATCH_MOMENTS_OP_H_ diff --git a/caffe2/operators/batch_sparse_to_dense_op.h b/caffe2/operators/batch_sparse_to_dense_op.h index de6c69b795d128..0e854d9e467bc0 100644 --- a/caffe2/operators/batch_sparse_to_dense_op.h +++ b/caffe2/operators/batch_sparse_to_dense_op.h @@ -19,6 +19,9 @@ class BatchSparseToDenseOp : public Operator { OP_SINGLE_ARG(T, "default_value", default_value_, static_cast(0)) {} bool RunOnDevice() override; + // TODO: enable the filler + DISABLE_INPUT_FILLERS(Context) + private: TIndex dense_last_dim_; T default_value_; diff --git a/caffe2/operators/boolean_mask_ops.cu b/caffe2/operators/boolean_mask_ops.cu index a976e7159309e8..85315768bd85d9 100644 --- a/caffe2/operators/boolean_mask_ops.cu +++ b/caffe2/operators/boolean_mask_ops.cu @@ -6,13 +6,12 @@ namespace caffe2 { namespace { -template __global__ void BooleanMaskCopyKernel( const TIndex numOfOutput, const TIndex numBytes, const TIndex* indices, - const T* src, - T* dest) { + const uint8_t* src, + uint8_t* dest) { for (TIndex i = blockIdx.x; i < numOfOutput; i += gridDim.x) { const auto srcBase = indices[i] * numBytes; const auto destBase = i * numBytes; @@ -81,8 +80,8 @@ class BooleanMaskOp final : public Operator { std::vector dims = src.dims(); dims[0] = numOfOutput; dest->Resize(dims); - auto* destData = (char*)dest->raw_mutable_data(src.meta()); - const auto* srcData = 
(char*)src.raw_data(); + auto* destData = (uint8_t*)dest->raw_mutable_data(src.meta()); + const auto* srcData = (uint8_t*)src.raw_data(); if (OutputSize() == 2) { auto* indicesOut = Output(1); indicesOut->Resize(numOfOutput); diff --git a/caffe2/operators/boolean_unmask_ops.cu b/caffe2/operators/boolean_unmask_ops.cu index 42801e17a64815..dcdec9c33df7be 100644 --- a/caffe2/operators/boolean_unmask_ops.cu +++ b/caffe2/operators/boolean_unmask_ops.cu @@ -27,7 +27,7 @@ __global__ void FillValuesKernel( const size_t itemSize, const int* indices, char* const values[], - int valueSizes[], + int* valueSizes, char* dest) { CUDA_1D_KERNEL_LOOP(j, numMasks) { int k = 0; diff --git a/caffe2/operators/byte_weight_dequant_op.cc b/caffe2/operators/byte_weight_dequant_op.cc new file mode 100644 index 00000000000000..6596fff76647ad --- /dev/null +++ b/caffe2/operators/byte_weight_dequant_op.cc @@ -0,0 +1,11 @@ +#include "caffe2/operators/byte_weight_dequant_op.h" + +#include "caffe2/utils/math.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(ByteWeightDequant, ByteWeightDequantOp); + +OPERATOR_SCHEMA(ByteWeightDequant).NumInputs(1).NumOutputs(1); + +} // namespace caffe2 diff --git a/caffe2/operators/byte_weight_dequant_op.h b/caffe2/operators/byte_weight_dequant_op.h new file mode 100644 index 00000000000000..14df6826a3bfcd --- /dev/null +++ b/caffe2/operators/byte_weight_dequant_op.h @@ -0,0 +1,55 @@ +#ifndef CAFFE2_OPERATORS_BYTE_WEIGHT_DEQUANT_OP_H_ +#define CAFFE2_OPERATORS_BYTE_WEIGHT_DEQUANT_OP_H_ + +#include "caffe2/core/operator.h" +#include "caffe2/utils/eigen_utils.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +class ByteWeightDequantOp : public Operator { + public: + ByteWeightDequantOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + min_(OperatorBase::GetSingleArgument("min", -3)), + max_(OperatorBase::GetSingleArgument("max", 3)), + shape_(OperatorBase::GetRepeatedArgument("shape")) {} + + USE_OPERATOR_FUNCTIONS(Context); + using Operator::Operator; + + bool RunOnDevice() override { + const auto& WI = Input(0); + auto* Y = Output(0); + Y->Resize(shape_); + float bin_interval = (max_ - min_) / 255.0; + int total = 1; + for (int i = 0; i < shape_.size(); i++) { + total *= Y->dim(i); + } + const uint8_t* Xdata; + if (WI.template IsType()) { + CAFFE_ENFORCE(total, WI.nbytes()); + Xdata = WI.template data(); + } else { + CAFFE_ENFORCE(total, WI.template data()[0].size()); + Xdata = reinterpret_cast( + WI.template data()[0].c_str()); + } + auto* Ydata = Y->template mutable_data(); + ConstEigenVectorMap index(&Xdata[0], total); + EigenVectorMap weights(&Ydata[0], total); + weights = (index.cast().array() * bin_interval) + min_; + return true; + } + + private: + float min_; + float max_; + std::vector shape_; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_BYTE_WEIGHT_DEQUANT_OP_H_ diff --git a/caffe2/operators/channel_shuffle_op_gpu.cu b/caffe2/operators/channel_shuffle_op_gpu.cu index 7e53add8615bee..447ba55d3436fa 100644 --- a/caffe2/operators/channel_shuffle_op_gpu.cu +++ b/caffe2/operators/channel_shuffle_op_gpu.cu @@ -1,5 +1,5 @@ #include "caffe2/core/context_gpu.h" -#include "channel_shuffle_op.h" +#include "caffe2/operators/channel_shuffle_op.h" namespace caffe2 { diff --git a/caffe2/operators/conv_op_shared_gpu.cc b/caffe2/operators/conv_op_shared_gpu.cc index 70570bde1c4161..eb5a762cbd3e3e 100644 --- a/caffe2/operators/conv_op_shared_gpu.cc +++ b/caffe2/operators/conv_op_shared_gpu.cc @@ -1,5 +1,5 @@ #include 
"caffe2/core/context_gpu.h" -#include "conv_op_shared.h" +#include "caffe2/operators/conv_op_shared.h" namespace caffe2 { diff --git a/caffe2/operators/cosine_embedding_criterion_op.cu b/caffe2/operators/cosine_embedding_criterion_op.cu index 792062f086b593..69a37ff3294453 100644 --- a/caffe2/operators/cosine_embedding_criterion_op.cu +++ b/caffe2/operators/cosine_embedding_criterion_op.cu @@ -9,7 +9,7 @@ __global__ void CECKernel( const int N, const float* S, const int* Y, const float margin, float* output) { CUDA_1D_KERNEL_LOOP(i, N) { - output[i] = Y[i] == 1 ? (1. - S[i]) : max(0.f, S[i] - margin); + output[i] = Y[i] == 1 ? (1. - S[i]) : fmaxf(0.f, S[i] - margin); } } diff --git a/caffe2/operators/counter_ops_gpu.cc b/caffe2/operators/counter_ops_gpu.cc index 7880aeeb419379..1c157633a62093 100644 --- a/caffe2/operators/counter_ops_gpu.cc +++ b/caffe2/operators/counter_ops_gpu.cc @@ -1,5 +1,5 @@ #include "caffe2/core/context_gpu.h" -#include "counter_ops.h" +#include "caffe2/operators/counter_ops.h" namespace caffe2 { REGISTER_CUDA_OPERATOR(CreateCounter, CreateCounterOp); diff --git a/caffe2/operators/distance_op.cu b/caffe2/operators/distance_op.cu index 037b1a50e1fd59..e1a56399a2f947 100644 --- a/caffe2/operators/distance_op.cu +++ b/caffe2/operators/distance_op.cu @@ -131,9 +131,9 @@ __global__ void L1DistanceKernel( for (int i = blockIdx.x; i < N; i += gridDim.x) { float sum = 0.0f; for (int j = threadIdx.x; j < D; j += blockDim.x) { - sum += - abs(convert::To(X[i * D + j]) - - convert::To(Y[i * D + j])); + sum += fabsf( + convert::To(X[i * D + j]) - + convert::To(Y[i * D + j])); } float aggregate = BlockReduce(temp_storage).Sum(sum); @@ -395,33 +395,33 @@ bool CosineSimilarityGradientOp::RunOnDevice() { context_.cuda_stream()>>>(N, D, X_data, Y_data, xy); math::Div(N, dCos_data, xyn, scale, &context_); // dX - BatchedMul<<< + BatchedMul<<< std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(N, D, Y_data, scale, dX_data); - Scale2AxpyScale<<< + Scale2AxpyScale<<< std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(N, scale, xy, xn, axpy_scale); - BatchedAxpy<<< + BatchedAxpy<<< std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(N, D, axpy_scale, X_data, dX_data); // dY - BatchedMul<<< + BatchedMul<<< std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(N, D, X_data, scale, dY_data); - Scale2AxpyScale<<< + Scale2AxpyScale<<< std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(N, scale, xy, yn, axpy_scale); - BatchedAxpy<<< + BatchedAxpy<<< std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), CAFFE_CUDA_NUM_THREADS, 0, diff --git a/caffe2/operators/elementwise_linear_op.cu b/caffe2/operators/elementwise_linear_op.cu index 503675a86d7de8..e4cd235eeffa38 100644 --- a/caffe2/operators/elementwise_linear_op.cu +++ b/caffe2/operators/elementwise_linear_op.cu @@ -1,6 +1,6 @@ #include -#include "elementwise_linear_op.h" +#include "caffe2/operators/elementwise_linear_op.h" #include "caffe2/core/context_gpu.h" #include "caffe2/operators/operator_fallback_gpu.h" diff --git a/caffe2/operators/elementwise_ops.cu b/caffe2/operators/elementwise_ops.cu index b564d8ae8005f8..1dee0d62724706 100644 --- a/caffe2/operators/elementwise_ops.cu +++ b/caffe2/operators/elementwise_ops.cu @@ -8,6 +8,11 @@ #include "caffe2/core/context_gpu.h" #include "caffe2/utils/conversions.h" +#ifdef __HIPCC__ +// rocblas doesn't fully 
support fp16 yet +#define ROCBLAS_FP16 0 +#endif + namespace caffe2 { REGISTER_CUDA_OPERATOR( @@ -111,6 +116,9 @@ void device_reduce( int N, Tensor* buffer, CUDAContext* context) { +#if defined(__HIPCC__) && !ROCBLAS_FP16 + CAFFE_THROW("HIP rocblas doesn't fully support fp16 device_reduce yet."); +#else auto buffer_size = 1; if (buffer->size() != buffer_size) { @@ -135,6 +143,7 @@ void device_reduce( out, CUDA_R_16F, CUDA_R_32F)); +#endif } template diff --git a/caffe2/operators/generate_proposals_op_test.cc b/caffe2/operators/generate_proposals_op_test.cc index 3fb7ed92e90a6a..af9214379becd8 100644 --- a/caffe2/operators/generate_proposals_op_test.cc +++ b/caffe2/operators/generate_proposals_op_test.cc @@ -2,6 +2,11 @@ #include #include "caffe2/core/flags.h" +#include "caffe2/core/macros.h" + +#ifdef CAFFE2_USE_OPENCV +#include +#endif // CAFFE2_USE_OPENCV namespace caffe2 { diff --git a/caffe2/operators/generate_proposals_op_util_nms.h b/caffe2/operators/generate_proposals_op_util_nms.h index 39e7febe272961..5d6f87d4d30563 100644 --- a/caffe2/operators/generate_proposals_op_util_nms.h +++ b/caffe2/operators/generate_proposals_op_util_nms.h @@ -4,12 +4,13 @@ #include #include "caffe2/core/logging.h" +#include "caffe2/core/macros.h" #include "caffe2/utils/eigen_utils.h" #include "caffe2/utils/math.h" -#if defined(CV_MAJOR_VERSION) && (CV_MAJOR_VERSION >= 3) +#ifdef CAFFE2_USE_OPENCV #include -#endif // CV_MAJOR_VERSION >= 3 +#endif // CAFFE2_USE_OPENCV namespace caffe2 { namespace utils { diff --git a/caffe2/operators/group_norm_op.cu b/caffe2/operators/group_norm_op.cu index daf3ab91e11161..cfdd308bceb050 100644 --- a/caffe2/operators/group_norm_op.cu +++ b/caffe2/operators/group_norm_op.cu @@ -6,7 +6,7 @@ // This is a stand-alone op: Y = gamma * (X - mu) / sig + beta // ------------------------------------------------------------------ -#include "group_norm_op.h" +#include "caffe2/operators/group_norm_op.h" #include diff --git a/caffe2/operators/gru_unit_op_gpu.cu b/caffe2/operators/gru_unit_op_gpu.cu index 2df357bf057865..ee923ae7a9667c 100644 --- a/caffe2/operators/gru_unit_op_gpu.cu +++ b/caffe2/operators/gru_unit_op_gpu.cu @@ -2,7 +2,7 @@ #include #include #include "caffe2/core/context_gpu.h" -#include "gru_unit_op.h" +#include "caffe2/operators/gru_unit_op.h" namespace caffe2 { diff --git a/caffe2/operators/heatmap_max_keypoint_op.cc b/caffe2/operators/heatmap_max_keypoint_op.cc new file mode 100644 index 00000000000000..ed714bdfddec38 --- /dev/null +++ b/caffe2/operators/heatmap_max_keypoint_op.cc @@ -0,0 +1,161 @@ +#include "heatmap_max_keypoint_op.h" +#include "caffe2/utils/eigen_utils.h" + +namespace caffe2 { +namespace { + +REGISTER_CPU_OPERATOR( + HeatmapMaxKeypoint, + HeatmapMaxKeypointOp); + +// Input: heatmaps [size x size], boxes [x0, y0, x1, y1] +// Output: keypoints (#rois, 4, #keypoints) +OPERATOR_SCHEMA(HeatmapMaxKeypoint).NumInputs(2).NumOutputs(1); + +SHOULD_NOT_DO_GRADIENT(HeatmapMaxKeypoint); +} // namespace + +/** +Mask R-CNN uses bicubic upscaling before taking the maximum of the heat map +for keypoints. We would like to avoid bicubic upscaling, because it is +computationally expensive. This approach uses the Taylor expansion up to the +quadratic terms on approximation of the heatmap function. 
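Spelling out that expansion with the symbols used in the code: place the origin at the heatmap argmax and let f be the heatmap restricted to the surrounding 3x3 window; then

  f(\delta) \approx f(0) + f'(0)^\top \delta + \tfrac{1}{2}\,\delta^\top f''(0)\,\delta,
  \qquad b := -f'(0), \quad A := f''(0),
  \nabla_\delta f = 0 \;\Longrightarrow\; A\,\delta = b \;\Longrightarrow\; \delta = A^{-1} b,
  \qquad f(\delta) \approx f(0) - b^\top \delta + \tfrac{1}{2}\,\delta^\top A\,\delta.

In the implementation, f'(0) and f''(0) are estimated with central/finite differences on the (boundary-mirrored) 3x3 grid, the 2x2 system A\delta = b is solved via an LDLT factorization (falling back to \delta = 0 when det(A) is near zero), \delta is clipped to at most 1.5 cells, and the refined location is mapped back to image coordinates as x0 + (0.5 + maxX + \delta_x) * xLen / heatmap_size, and analogously for y.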
+**/ +template <> +bool HeatmapMaxKeypointOp::RunOnDevice() { + const auto& heatmaps_in = Input(0); + const auto& bboxes_in = Input(1); + auto* keypoints_out = Output(0); + + CAFFE_ENFORCE_EQ(heatmaps_in.ndim(), 4); + const int N = heatmaps_in.dim32(0); + CAFFE_ENFORCE_EQ(heatmaps_in.dim32(0), N); + const int keypoint_count = heatmaps_in.dim32(1); + const int heatmap_size = heatmaps_in.dim32(2); + CAFFE_ENFORCE_GE(heatmap_size, 2); // at least 2x2 for approx + CAFFE_ENFORCE_EQ(heatmaps_in.dim32(2), heatmaps_in.dim32(3)); + + CAFFE_ENFORCE_EQ(bboxes_in.ndim(), 2); + CAFFE_ENFORCE_EQ(bboxes_in.dim32(0), N); + CAFFE_ENFORCE_GE(bboxes_in.dim32(1), 4); + + // Wrap inputs in Eigen + Eigen::Map heatmaps( + heatmaps_in.data(), + heatmaps_in.dim32(0) * heatmaps_in.dim32(1), + heatmaps_in.dim32(2) * heatmaps_in.dim32(3)); + Eigen::Map bboxes( + bboxes_in.data(), bboxes_in.dim32(0), bboxes_in.dim32(1)); + + // Calculate the softmax + ERArrXXf probs( + heatmaps_in.dim32(0) * heatmaps_in.dim32(1), + heatmaps_in.dim32(2) * heatmaps_in.dim32(3)); + if (should_output_softmax_) { + // softmax output is expensive to compute, if should_output_softmax is not + // specified, don't populate it + ERArrXXf heatmap_exp = heatmaps.exp(); + for (int r = 0; r < N * keypoint_count; r++) { + probs.row(r) = heatmap_exp.row(r) / heatmap_exp.row(r).sum(); + } + } /* otherwise not initialized */ + + // Resize and wrap outputs in Eigen + keypoints_out->Resize(N, 4, keypoint_count); + Eigen::Map keypoints( + keypoints_out->mutable_data(), N, 4 * keypoint_count); + + EArrXi maxIndices(N * keypoint_count); + // finding max value first (only maxCoeff() is vectorized, not + // maxCoeff(&index)), then find the index (equalness check is also fast) + EArrXf maxScores = heatmaps.rowwise().maxCoeff(); + for (int r = 0; r < N * keypoint_count; r++) { + float maxScore = maxScores[r]; + for (int c = 0; c < heatmap_size * heatmap_size; c++) { + if (heatmaps(r, c) == maxScore) { + maxIndices[r] = c; + break; + } + } + } + + // Populate outputs + for (int k = 0; k < N; k++) { // For each box, even skipped + + float x0 = bboxes(k, 0); + float y0 = bboxes(k, 1); + float xLen = std::max(bboxes(k, 2) - bboxes(k, 0), 1.0f); + float yLen = std::max(bboxes(k, 3) - bboxes(k, 1), 1.0f); + + // Extract max keypoints and probabilities from heatmaps + for (int j = 0; j < keypoint_count; j++) { + const int heatmap_index = k * keypoint_count + j; + const int maxIndex = maxIndices[heatmap_index]; + const float maxScore = maxScores[heatmap_index]; + const int maxY = maxIndex / heatmap_size; + const int maxX = maxIndex - heatmap_size * maxY; + + assert(heatmaps(heatmap_index, maxIndex) == maxScore); + ERArrXXf fmax = ERArrXXf::Zero(3, 3); + + // initialize fmax values of local 3x3 grid + // when 3x3 grid going out-of-bound, mirrowing around center + for (int y = -1; y <= 1; y++) { + for (int x = -1; x <= 1; x++) { + int xx = x - 2 * (x + maxX >= heatmap_size) + 2 * (x + maxX < 0); + int yy = y - 2 * (y + maxY >= heatmap_size) + 2 * (y + maxY < 0); + assert((xx + maxX < heatmap_size) && (xx + maxX >= 0)); + assert((yy + maxY < heatmap_size) && (yy + maxY >= 0)); + const int coord_index = (yy + maxY) * heatmap_size + xx + maxX; + fmax(y + 1, x + 1) = heatmaps(heatmap_index, coord_index); + } + } + + // b = -f'(0), A = f''(0) Hessian matrix + EVecXf b(2); + b << -(fmax(1, 2) - fmax(1, 0)) / 2, -(fmax(2, 1) - fmax(0, 1)) / 2; + EMatXf A(2, 2); + A << fmax(1, 0) - 2 * fmax(1, 1) + fmax(1, 2), + (fmax(2, 2) - fmax(2, 0) - fmax(0, 2) + fmax(0, 0)) / 4, + (fmax(2, 
2) - fmax(2, 0) - fmax(0, 2) + fmax(0, 0)) / 4, + fmax(0, 1) - 2 * fmax(1, 1) + fmax(2, 1); + + // Solve Ax=b + const float div = A.determinant(); + EVecXf delta(2); + float deltaScore; + const float MAX_DELTA = 1.5; + if (std::abs(div) < 1e-4f) { + delta << 0.0f, 0.0f; + deltaScore = maxScore; + } else { + delta = A.ldlt().solve(b); + // clip delta if going out-of-range of 3x3 grid + if (std::abs(delta(0)) > MAX_DELTA || std::abs(delta(1)) > MAX_DELTA) { + float larger_delta = std::max(std::abs(delta(0)), std::abs(delta(1))); + delta(0) = delta(0) / larger_delta * MAX_DELTA; + delta(1) = delta(1) / larger_delta * MAX_DELTA; + } + deltaScore = fmax(1, 1) - b.transpose() * delta + + 1.0 / 2.0 * delta.transpose() * A * delta; + } + assert(std::abs(delta(0)) <= MAX_DELTA); + assert(std::abs(delta(1)) <= MAX_DELTA); + // find maximum of detla scores + keypoints(k, 0 * keypoint_count + j) = + x0 + (0.5 + maxX + delta(0)) * xLen / heatmap_size; + keypoints(k, 1 * keypoint_count + j) = + y0 + (0.5 + maxY + delta(1)) * yLen / heatmap_size; + keypoints(k, 2 * keypoint_count + j) = deltaScore; + if (should_output_softmax_) { + keypoints(k, 3 * keypoint_count + j) = probs(heatmap_index, maxIndex); + } else { + keypoints(k, 3 * keypoint_count + j) = .0f; + } + } + } + + return true; +} + +} // namespace caffe2 diff --git a/caffe2/operators/heatmap_max_keypoint_op.h b/caffe2/operators/heatmap_max_keypoint_op.h new file mode 100644 index 00000000000000..352c9ff109de94 --- /dev/null +++ b/caffe2/operators/heatmap_max_keypoint_op.h @@ -0,0 +1,31 @@ +// Copyright 2004-present Facebook. All Rights Reserved. + +#ifndef HEATMAP_MAX_KEYPOINT_OP_H_ +#define HEATMAP_MAX_KEYPOINT_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +class HeatmapMaxKeypointOp final : public Operator { + public: + HeatmapMaxKeypointOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + should_output_softmax_(OperatorBase::GetSingleArgument( + "should_output_softmax", + false)) {} + USE_OPERATOR_CONTEXT_FUNCTIONS; + + bool RunOnDevice() override; + + protected: + bool should_output_softmax_ = false; +}; + +} // namespace caffe2 + +#endif // HEATMAP_MAX_KEYPOINT_OP_H_ diff --git a/caffe2/operators/hip/operator_fallback_hip.h b/caffe2/operators/hip/operator_fallback_hip.h deleted file mode 100644 index 62e5fe8f01e5dc..00000000000000 --- a/caffe2/operators/hip/operator_fallback_hip.h +++ /dev/null @@ -1,114 +0,0 @@ -#ifndef CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_ -#define CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_ - -#include "caffe2/core/common.h" -#include "caffe2/core/context.h" -#include "caffe2/core/hip/context_hip.h" -#include "caffe2/core/operator.h" -#include "caffe2/proto/caffe2.pb.h" - -namespace caffe2 { - -/** - * @brief A templated class to allow one to wrap a CPU operator as a CUDA - * operator. - * - * This class can be used when one does not have the CUDA implementation ready - * yet for an operator. Essentially, what this op does is to automatically - * deal with data copy for you. Plausibly, this causes a lot of overhead and - * is not optimal, so you should use this operator mostly for quick prototyping - * purpose. - * - * All the input and output of the original operator should be TensorCPU. 
- * - * Example usage: if you have a class MyMagicOp that is CPU based, and you use - * the registration code - * REGISTER_CPU_OPERATOR(MyMagic, MyMagicOp); - * to register the CPU side, you can create its corresponding GPU operator - * (with performance hits of course) via - * REGISTER_HIP_OPERATOR(MyMagic, - * GPUFallbackOp); - * - * Advanced usage: if you want to have some specific outputs never copied, you - * can use the SkipOutputCopy template argument to do that. For example, if - * MyMagic produces two outputs and the first output is always going to live on - * the CPU, you can do - * REGISTER_HIP_OPERATOR(MyMagic, - * GPUFallbackOp>); - */ -template > -class GPUFallbackOp final : public Operator { - public: - USE_OPERATOR_FUNCTIONS(HIPContext); - GPUFallbackOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws) { - CAFFE_ENFORCE_EQ(def.device_option().device_type(), HIP); - OperatorDef base_def_(def); - // base_def_ runs on CPU, so we will set its device option to CPU. - base_def_.clear_device_option(); - base_def_.mutable_device_option()->set_device_type(CPU); - // Set up the symbols for the local workspace. - for (const string& name : def.input()) { - local_input_blobs_.push_back(local_ws_.CreateBlob(name)); - CHECK_NOTNULL(local_input_blobs_.back()); - } - base_op_.reset(new CPUOp(base_def_, &local_ws_)); - for (const string& name : def.output()) { - local_output_blobs_.push_back(local_ws_.GetBlob(name)); - CHECK_NOTNULL(local_output_blobs_.back()); - } - } - - bool RunOnDevice() override { - bool need_sync = false; - for (int i = 0; i < InputSize(); ++i) { - if (OperatorBase::InputIsType(i)) { - local_input_blobs_[i]->template GetMutable()->CopyFrom( - Input(i), &context_); - need_sync = true; - } else { - VLOG(1) << "Input " << i << " is not TensorHIP. Skipping copy."; - // Note(jiayq): This removes a const but conceptually - // local_input_blobs will only be used as const blob input for the - // base op so we are still fine. - local_input_blobs_[i]->ShareExternal( - const_cast(OperatorBase::Inputs()[i]->GetRaw()), - OperatorBase::Inputs()[i]->meta()); - } - } - - // Sync to make sure copies are done. - if (need_sync) { - context_.FinishDeviceComputation(); - } - - if (!base_op_->Run()) { - LOG(ERROR) << "Base op run failed in GPUFallbackOp. 
Def: " - << ProtoDebugString(this->debug_def()); - return false; - } - for (int i = 0; i < OutputSize(); ++i) { - if (SkipOutputCopy::Contains(i)) { - VLOG(1) << "Copy output: index " << i << " skipped."; - continue; - } - CAFFE_ENFORCE( - local_output_blobs_[i]->template IsType(), - "GPU fallback op currently does not support non-TensorCPU " - "output type who needs copying."); - Output(i)->CopyFrom( - local_output_blobs_[i]->template Get(), &context_); - } - return true; - } - - protected: - Workspace local_ws_; - vector local_input_blobs_; - vector local_output_blobs_; - std::unique_ptr base_op_; -}; - -} // namespace caffe2 - -#endif // CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_ diff --git a/caffe2/operators/hip/operator_fallback_hip_test.cc b/caffe2/operators/hip/operator_fallback_hip_test.cc deleted file mode 100644 index 4a074c35f8a186..00000000000000 --- a/caffe2/operators/hip/operator_fallback_hip_test.cc +++ /dev/null @@ -1,80 +0,0 @@ -#include - -#include -#include "caffe2/core/operator.h" -#include "caffe2/operators/hip/operator_fallback_hip.h" - -namespace caffe2 { - -class IncrementByOneOp final : public Operator { - public: - IncrementByOneOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws) {} - bool RunOnDevice() { - const auto& in = Input(0); - auto* out = Output(0); - out->ResizeLike(in); - const float* in_data = in.template data(); - float* out_data = out->template mutable_data(); - for (int i = 0; i < in.size(); ++i) { - out_data[i] = in_data[i] + 1.f; - } - return true; - } -}; - -OPERATOR_SCHEMA(IncrementByOne) - .NumInputs(1) - .NumOutputs(1) - .AllowInplace({{0, 0}}); - -REGISTER_CPU_OPERATOR(IncrementByOne, IncrementByOneOp); -REGISTER_HIP_OPERATOR(IncrementByOne, GPUFallbackOp); - -TEST(OperatorFallbackTest, IncrementByOneOp) { - OperatorDef op_def = CreateOperatorDef( - "IncrementByOne", "", vector{"X"}, vector{"X"}); - Workspace ws; - TensorCPU source_tensor(vector{2, 3}); - for (int i = 0; i < 6; ++i) { - source_tensor.mutable_data()[i] = i; - } - ws.CreateBlob("X")->GetMutable()->CopyFrom(source_tensor); - unique_ptr op(CreateOperator(op_def, &ws)); - EXPECT_TRUE(op.get() != nullptr); - EXPECT_TRUE(op->Run()); - const TensorCPU& output = ws.GetBlob("X")->Get(); - EXPECT_EQ(output.ndim(), 2); - EXPECT_EQ(output.dim(0), 2); - EXPECT_EQ(output.dim(1), 3); - for (int i = 0; i < 6; ++i) { - EXPECT_EQ(output.data()[i], i + 1); - } -} - -TEST(OperatorFallbackTest, GPUIncrementByOneOp) { - if (!HasHipGPU()) - return; - OperatorDef op_def = CreateOperatorDef( - "IncrementByOne", "", vector{"X"}, vector{"X"}); - op_def.mutable_device_option()->set_device_type(HIP); - Workspace ws; - TensorCPU source_tensor(vector{2, 3}); - for (int i = 0; i < 6; ++i) { - source_tensor.mutable_data()[i] = i; - } - ws.CreateBlob("X")->GetMutable()->CopyFrom(source_tensor); - unique_ptr op(CreateOperator(op_def, &ws)); - EXPECT_TRUE(op.get() != nullptr); - EXPECT_TRUE(op->Run()); - const TensorHIP& output = ws.GetBlob("X")->Get(); - TensorCPU output_cpu(output); - EXPECT_EQ(output.ndim(), 2); - EXPECT_EQ(output.dim(0), 2); - EXPECT_EQ(output.dim(1), 3); - for (int i = 0; i < 6; ++i) { - EXPECT_EQ(output_cpu.data()[i], i + 1); - } -} - -} // namespace caffe2 diff --git a/caffe2/operators/instance_norm_op.cu b/caffe2/operators/instance_norm_op.cu index 5a5010dc608b36..87532066278b2e 100644 --- a/caffe2/operators/instance_norm_op.cu +++ b/caffe2/operators/instance_norm_op.cu @@ -51,7 +51,7 @@ __global__ void InstanceNormInvStdevKernel( } inv_stdev_data[i] /= dim; inv_stdev_data[i] += 
epsilon; - inv_stdev_data[i] = 1.0 / std::sqrt(inv_stdev_data[i]); + inv_stdev_data[i] = 1.0 / sqrtf(inv_stdev_data[i]); } } diff --git a/caffe2/operators/integral_image_op.cu b/caffe2/operators/integral_image_op.cu index d5c122001292fb..872d29bd0dddb4 100644 --- a/caffe2/operators/integral_image_op.cu +++ b/caffe2/operators/integral_image_op.cu @@ -1,5 +1,5 @@ #include "caffe2/core/context_gpu.h" -#include "integral_image_op.h" +#include "caffe2/operators/integral_image_op.h" namespace caffe2 { diff --git a/caffe2/operators/layer_norm_op.cu b/caffe2/operators/layer_norm_op.cu index 0309b4e6d7fe10..bcec393b2ad95c 100644 --- a/caffe2/operators/layer_norm_op.cu +++ b/caffe2/operators/layer_norm_op.cu @@ -116,7 +116,7 @@ bool LayerNormOp::DoRunWithType() { mean->CopyFrom(input); mean->Resize(stats_dims); math::Set( - left, std::sqrt(epsilon_), stdev->mutable_data(), &context_); + left, sqrtf(epsilon_), stdev->mutable_data(), &context_); } else { // Calculate row-wise means // First stage: sum up feature vectors diff --git a/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h b/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h index 5fdcb1d13058bf..7c42d522f2e71f 100644 --- a/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h +++ b/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h @@ -68,6 +68,8 @@ class SparseLengthsFused8BitRowwiseOp : public Operator { return true; } + USE_VALUE_KEY_LENGTH_INPUT_FILLERS(Context, DATA, INDICES, LENGTHS) + private: enum { DATA = 0, diff --git a/caffe2/operators/lengths_reducer_ops.h b/caffe2/operators/lengths_reducer_ops.h index 461038ca3cb97f..505dad1b102de3 100644 --- a/caffe2/operators/lengths_reducer_ops.h +++ b/caffe2/operators/lengths_reducer_ops.h @@ -92,6 +92,8 @@ class CPUSparseLengthsReductionOp : public Operator { return true; } + USE_VALUE_KEY_LENGTH_INPUT_FILLERS(CPUContext, DATA, INDICES, LENGTHS) + private: enum { DATA = 0, // Data input. 
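The HeatmapMaxKeypointOp hunk earlier in this patch refines each integer argmax with a second-order Taylor fit over the surrounding 3x3 patch: b is the negated finite-difference gradient, A the finite-difference Hessian, the sub-pixel offset solves A·delta = b (via Eigen's LDLT in the operator), and the offset is rescaled so neither component exceeds 1.5 cells. The following is a minimal standalone NumPy sketch of that math for illustration only; it is not part of this patch, and it uses a general solver instead of LDLT.

```python
import numpy as np

MAX_DELTA = 1.5  # same clamp as the operator's MAX_DELTA

def refine_peak(f):
    """f: 3x3 patch of heatmap values around an integer argmax, f[1, 1] is the peak.
    Returns (delta, score): sub-pixel offset (dx, dy) and the refined score."""
    f = np.asarray(f, dtype=np.float64)
    # b = -f'(0), A = f''(0), estimated with central finite differences.
    b = np.array([-(f[1, 2] - f[1, 0]) / 2.0,
                  -(f[2, 1] - f[0, 1]) / 2.0])
    dxy = (f[2, 2] - f[2, 0] - f[0, 2] + f[0, 0]) / 4.0
    A = np.array([[f[1, 0] - 2.0 * f[1, 1] + f[1, 2], dxy],
                  [dxy, f[0, 1] - 2.0 * f[1, 1] + f[2, 1]]])
    # Near-singular Hessian: keep the grid maximum as-is.
    if abs(np.linalg.det(A)) < 1e-4:
        return np.zeros(2), f[1, 1]
    delta = np.linalg.solve(A, b)
    # Rescale (not clip per component) so the correction stays inside the
    # 3x3 window while preserving its direction.
    largest = np.max(np.abs(delta))
    if largest > MAX_DELTA:
        delta *= MAX_DELTA / largest
    # Refined score of the fitted quadratic at the offset.
    score = f[1, 1] - b @ delta + 0.5 * delta @ A @ delta
    return delta, score

# Example: the peak's right neighbor is larger than its left neighbor,
# so the refined offset moves toward +x and the score exceeds the grid max.
patch = np.array([[0.2, 0.5, 0.3],
                  [0.3, 1.0, 0.8],
                  [0.2, 0.6, 0.4]])
print(refine_peak(patch))
```

The refined score f(1,1) - bᵀδ + ½ δᵀAδ is what the operator stores in the score row of its output; the offset itself is added to the argmax position before mapping back into box coordinates.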
diff --git a/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h b/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h index 58ebe6cb58e846..8af4a413239997 100644 --- a/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h +++ b/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h @@ -87,6 +87,8 @@ class SparseLengths8BitsRowwiseOp : public Operator { return true; } + USE_VALUE_LENGTH_INPUT_FILLERS(Context, DATA, LENGTHS) + enum { DATA = 0, WEIGHTS = 1, diff --git a/caffe2/operators/lstm_unit_op_gpu.cu b/caffe2/operators/lstm_unit_op_gpu.cu index bdd62e5c3c4228..e15d706635b447 100644 --- a/caffe2/operators/lstm_unit_op_gpu.cu +++ b/caffe2/operators/lstm_unit_op_gpu.cu @@ -2,7 +2,7 @@ #include #include #include "caffe2/core/context_gpu.h" -#include "lstm_unit_op.h" +#include "caffe2/operators/lstm_unit_op.h" namespace caffe2 { diff --git a/caffe2/operators/max_pool_with_index.cu b/caffe2/operators/max_pool_with_index.cu index b8e6d2b469e5dc..5ac3c58bb5f89b 100644 --- a/caffe2/operators/max_pool_with_index.cu +++ b/caffe2/operators/max_pool_with_index.cu @@ -1,4 +1,4 @@ -#include "caffe2/operators/max_pool_with_index.h" +#include "caffe2/operators/max_pool_with_index_gpu.h" #include "caffe2/utils/conversions.h" namespace caffe2 { diff --git a/caffe2/operators/max_pool_with_index.h b/caffe2/operators/max_pool_with_index_gpu.h similarity index 88% rename from caffe2/operators/max_pool_with_index.h rename to caffe2/operators/max_pool_with_index_gpu.h index 64337dc56088f7..abc26233a7dce6 100644 --- a/caffe2/operators/max_pool_with_index.h +++ b/caffe2/operators/max_pool_with_index_gpu.h @@ -1,5 +1,4 @@ -#ifndef CAFFE2_OPERATORS_MAX_POOL_WITH_INDEX_H_ -#define CAFFE2_OPERATORS_MAX_POOL_WITH_INDEX_H_ +#pragma once #include #include "caffe2/core/context.h" @@ -45,5 +44,3 @@ class MaxPoolWithIndexGradientOp final : public ConvPoolOpBase { }; }; // namespace caffe2 - -#endif // CAFFE2_OPERATORS_MAX_POOL_WITH_INDEX_H_ diff --git a/caffe2/operators/normalize_op.cc b/caffe2/operators/normalize_op.cc index 1a7d720deb6c3c..73a88201b3278e 100644 --- a/caffe2/operators/normalize_op.cc +++ b/caffe2/operators/normalize_op.cc @@ -12,6 +12,7 @@ void NormalizeOp::DoNormalize( const int m, const int n, const int sf) { + const T kEps = 1e-12f; using InnerStride = Eigen::InnerStride; using StridedVec = Eigen::Map, 0, InnerStride>; @@ -22,10 +23,9 @@ void NormalizeOp::DoNormalize( auto base = (i / sf) * sf * m + (i % sf); ConstStridedVec xVec(xData + base, 1, m, InnerStride(sf)); auto norm = xVec.template lpNorm<2>(); - if (norm != 0) { - StridedVec yVec(yData + base, 1, m, InnerStride(sf)); - yVec = xVec / norm; - } + norm = std::max(norm, kEps); + StridedVec yVec(yData + base, 1, m, InnerStride(sf)); + yVec = xVec / norm; } }; @@ -37,6 +37,7 @@ void NormalizeGradientOp::DoNormalize( const int m, const int n, const int sf) { + const T kEps = 1e-12f; using InnerStride = Eigen::InnerStride; using StridedVec = Eigen::Map, 0, InnerStride>; @@ -50,11 +51,10 @@ void NormalizeGradientOp::DoNormalize( auto row_sum = xVec.dot(gOutVec); auto row_norm = xVec.template lpNorm<2>(); + row_norm = std::max(row_norm, kEps); auto row_norm_3 = pow(row_norm, 3); - if (row_norm != 0) { - StridedVec gInVec(gInData + base, 1, m, InnerStride(sf)); - gInVec = (gOutVec / row_norm) - ((xVec / row_norm_3) * row_sum); - } + StridedVec gInVec(gInData + base, 1, m, InnerStride(sf)); + gInVec = (gOutVec / row_norm) - ((xVec / row_norm_3) * row_sum); } }; diff --git a/caffe2/operators/normalize_ops.cu b/caffe2/operators/normalize_ops.cu index 
343a8cee24ec8d..dcffe02f650abf 100644 --- a/caffe2/operators/normalize_ops.cu +++ b/caffe2/operators/normalize_ops.cu @@ -12,6 +12,7 @@ __global__ void NormalizeKernel( const int sf, const float* xData, float* yData) { + const float kEps = 1e-12f; typedef cub::BlockReduce BlockReduce; __shared__ BlockReduce::TempStorage temp_storage; @@ -27,14 +28,13 @@ __global__ void NormalizeKernel( float reduce_result = BlockReduce(temp_storage).Sum(sum); if (threadIdx.x == 0) { - norm = sqrt(reduce_result); + norm = sqrtf(reduce_result); + norm = fmaxf(norm, kEps); } __syncthreads(); - if (norm != 0) { - for (int j = threadIdx.x; j < m; j += blockDim.x) { - const auto index = base + j * sf; - yData[index] = xData[index] / norm; - } + for (int j = threadIdx.x; j < m; j += blockDim.x) { + const auto index = base + j * sf; + yData[index] = xData[index] / norm; } } } @@ -46,6 +46,7 @@ __global__ void NormalizeGradientKernel( const float* in_mat, const float* grad_out_mat, float* grad_mat) { + const float kEps = 1e-12f; typedef cub::BlockReduce BlockReduce; __shared__ BlockReduce::TempStorage temp_storage_sum; __shared__ BlockReduce::TempStorage temp_storage_norm; @@ -66,8 +67,9 @@ __global__ void NormalizeGradientKernel( if (threadIdx.x == 0) { row_sum = reduce_result; - row_norm = sqrt(reduce_norm); - row_norm_3 = pow(row_norm, 3); + row_norm = sqrtf(reduce_norm); + row_norm = fmaxf(row_norm, kEps); + row_norm_3 = powf(row_norm, 3); } __syncthreads(); for (int j = threadIdx.x; j < N; j += blockDim.x) { @@ -131,7 +133,7 @@ __global__ void NormalizeL1Kernel( __shared__ float norm; for (int j = threadIdx.x; j < m; j += blockDim.x) { const auto x_ij = xData[base + j * sf]; - sum += abs(x_ij); + sum += fabsf(x_ij); } float reduce_result = BlockReduce(temp_storage).Sum(sum); diff --git a/caffe2/operators/piecewise_linear_transform_op.cu b/caffe2/operators/piecewise_linear_transform_op.cu index 877b795c19076b..ecc9f0f2493972 100644 --- a/caffe2/operators/piecewise_linear_transform_op.cu +++ b/caffe2/operators/piecewise_linear_transform_op.cu @@ -256,8 +256,8 @@ bool PiecewiseLinearTransformOp::TransformBinary() { X.data(), Y->mutable_data()); } else { + // don't want N*M threads, only N*M/2 PieceWiseLinearTransformBinaryKernel2<<< - // don't want N*M threads, only N*M/2 CAFFE_GET_BLOCKS(X.size() / 2), CAFFE_CUDA_NUM_THREADS, 0, diff --git a/caffe2/operators/relu_op.cu b/caffe2/operators/relu_op.cu index d392e4994bc14e..7309270aa28cc8 100644 --- a/caffe2/operators/relu_op.cu +++ b/caffe2/operators/relu_op.cu @@ -9,6 +9,10 @@ namespace caffe2 { namespace { +#ifdef __HIPCC__ +typedef __half2 half2; +#endif + template __global__ void ReluCUDAKernel(const int N, const T* X, T* Y) { CUDA_1D_KERNEL_LOOP(i, N) { diff --git a/caffe2/operators/resize_op.cc b/caffe2/operators/resize_op.cc index 508ab3390e6072..8a272a3d40f96a 100644 --- a/caffe2/operators/resize_op.cc +++ b/caffe2/operators/resize_op.cc @@ -153,7 +153,8 @@ output_width = floor(input_width * width_scale) output_height = floor(output_height * height_scale) )DOC") .Input(0, "X", "Input tensor") - .Output(0, "Y", "Output tensor"); + .Output(0, "Y", "Output tensor") + .InheritOnnxSchema("Upsample"); // Input: dY, output: dX OPERATOR_SCHEMA(ResizeNearestGradient) diff --git a/caffe2/operators/resize_op.cu b/caffe2/operators/resize_op.cu index 5098d1d590ccbe..0e1d55e5a4f309 100644 --- a/caffe2/operators/resize_op.cu +++ b/caffe2/operators/resize_op.cu @@ -1,6 +1,6 @@ #include "caffe2/core/context_gpu.h" #include "caffe2/utils/math.h" -#include "resize_op.h" 
+#include "caffe2/operators/resize_op.h" namespace caffe2 { diff --git a/caffe2/operators/reverse_packed_segs_op.cu b/caffe2/operators/reverse_packed_segs_op.cu index fdcffc66c240e4..aab600e27c7496 100644 --- a/caffe2/operators/reverse_packed_segs_op.cu +++ b/caffe2/operators/reverse_packed_segs_op.cu @@ -1,5 +1,5 @@ #include "caffe2/core/context_gpu.h" -#include "reverse_packed_segs_op.h" +#include "caffe2/operators/reverse_packed_segs_op.h" namespace caffe2 { diff --git a/caffe2/operators/roi_align_gradient_op.cu b/caffe2/operators/roi_align_gradient_op.cu index 26fb555d184cde..534d55ddd9a469 100644 --- a/caffe2/operators/roi_align_gradient_op.cu +++ b/caffe2/operators/roi_align_gradient_op.cu @@ -1,4 +1,4 @@ -#include "roi_align_gradient_op.h" +#include "caffe2/operators/roi_align_gradient_op.h" #include #include diff --git a/caffe2/operators/roi_align_op.cu b/caffe2/operators/roi_align_op.cu index 4a448ae8939e3f..e512f3d9741393 100644 --- a/caffe2/operators/roi_align_op.cu +++ b/caffe2/operators/roi_align_op.cu @@ -1,4 +1,4 @@ -#include "roi_align_op.h" +#include "caffe2/operators/roi_align_op.h" #include #include @@ -76,6 +76,7 @@ __global__ void RoIAlignForward( const int pooled_width, const int sampling_ratio, const T* bottom_rois, + int roi_cols, T* top_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output @@ -84,18 +85,23 @@ __global__ void RoIAlignForward( int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; - const T* offset_bottom_rois = bottom_rois + n * 5; - int roi_batch_ind = offset_bottom_rois[0]; + // RoI could have 4 or 5 columns + const T* offset_bottom_rois = bottom_rois + n * roi_cols; + int roi_batch_ind = 0; + if (roi_cols == 5) { + roi_batch_ind = offset_bottom_rois[0]; + offset_bottom_rois++; + } // Do not using rounding; this implementation detail is critical - T roi_start_w = offset_bottom_rois[1] * spatial_scale; - T roi_start_h = offset_bottom_rois[2] * spatial_scale; - T roi_end_w = offset_bottom_rois[3] * spatial_scale; - T roi_end_h = offset_bottom_rois[4] * spatial_scale; - // T roi_start_w = roundf(offset_bottom_rois[1] * spatial_scale); - // T roi_start_h = roundf(offset_bottom_rois[2] * spatial_scale); - // T roi_end_w = roundf(offset_bottom_rois[3] * spatial_scale); - // T roi_end_h = roundf(offset_bottom_rois[4] * spatial_scale); + T roi_start_w = offset_bottom_rois[0] * spatial_scale; + T roi_start_h = offset_bottom_rois[1] * spatial_scale; + T roi_end_w = offset_bottom_rois[2] * spatial_scale; + T roi_end_h = offset_bottom_rois[3] * spatial_scale; + // T roi_start_w = roundf(offset_bottom_rois[0] * spatial_scale); + // T roi_start_h = roundf(offset_bottom_rois[1] * spatial_scale); + // T roi_end_w = roundf(offset_bottom_rois[2] * spatial_scale); + // T roi_end_h = roundf(offset_bottom_rois[3] * spatial_scale); // Force malformed ROIs to be 1x1 T roi_width = max(roi_end_w - roi_start_w, (T)1.); @@ -173,6 +179,7 @@ bool RoIAlignOp::RunOnDevice() { pooled_width_, sampling_ratio_, R.data(), + R.dim32(1), Y->mutable_data()); return true; } diff --git a/caffe2/operators/roi_align_op_gpu_test.cc b/caffe2/operators/roi_align_op_gpu_test.cc index 199500f93df3a8..ed4ef33a1d6880 100644 --- a/caffe2/operators/roi_align_op_gpu_test.cc +++ b/caffe2/operators/roi_align_op_gpu_test.cc @@ -1,5 +1,5 @@ #include "caffe2/utils/eigen_utils.h" -#include "roi_align_op.h" +#include "caffe2/operators/roi_align_op.h" #include "caffe2/core/context_gpu.h" #include 
"caffe2/core/flags.h" diff --git a/caffe2/operators/roi_align_rotated_gradient_op.cu b/caffe2/operators/roi_align_rotated_gradient_op.cu index 1606209944d0eb..1941029fc9d3fb 100644 --- a/caffe2/operators/roi_align_rotated_gradient_op.cu +++ b/caffe2/operators/roi_align_rotated_gradient_op.cu @@ -3,7 +3,7 @@ #endif // _MSC_VER #include -#include "roi_align_rotated_gradient_op.h" +#include "caffe2/operators/roi_align_rotated_gradient_op.h" #include #include diff --git a/caffe2/operators/roi_align_rotated_op.cu b/caffe2/operators/roi_align_rotated_op.cu index 3f8a609451fbd4..0fad3d74b397eb 100644 --- a/caffe2/operators/roi_align_rotated_op.cu +++ b/caffe2/operators/roi_align_rotated_op.cu @@ -3,7 +3,7 @@ #endif // _MSC_VER #include -#include "roi_align_rotated_op.h" +#include "caffe2/operators/roi_align_rotated_op.h" #include #include diff --git a/caffe2/operators/roi_pool_op.cu b/caffe2/operators/roi_pool_op.cu index 34dcebb72b3ca4..45839117b2eda3 100644 --- a/caffe2/operators/roi_pool_op.cu +++ b/caffe2/operators/roi_pool_op.cu @@ -1,7 +1,7 @@ #include #include "caffe2/core/context_gpu.h" -#include "roi_pool_op.h" +#include "caffe2/operators/roi_pool_op.h" namespace caffe2 { diff --git a/caffe2/operators/selu_op.cu b/caffe2/operators/selu_op.cu index 314f2b48370e53..95eb2c54ee96a1 100644 --- a/caffe2/operators/selu_op.cu +++ b/caffe2/operators/selu_op.cu @@ -33,7 +33,7 @@ bool SeluOp::RunOnDevice() { auto* Y = Output(0); CAFFE_ENFORCE_GT(X.size(), 0); Y->ResizeLike(X); - SeluKernel<<< + SeluKernel<<< CAFFE_GET_BLOCKS(X.size()), CAFFE_CUDA_NUM_THREADS, 0, @@ -50,7 +50,7 @@ bool SeluGradientOp::RunOnDevice() { CAFFE_ENFORCE_GT(Y.size(), 0); CAFFE_ENFORCE_EQ(dY.size(), Y.size()); dX->ResizeLike(Y); - SeluGradientKernel<<< + SeluGradientKernel<<< CAFFE_GET_BLOCKS(Y.size()), CAFFE_CUDA_NUM_THREADS, 0, diff --git a/caffe2/operators/softmax_ops.cu b/caffe2/operators/softmax_ops.cu index 8795b6c9f7a8a0..08dbf6e7d07a48 100644 --- a/caffe2/operators/softmax_ops.cu +++ b/caffe2/operators/softmax_ops.cu @@ -2,9 +2,9 @@ #include #include "caffe2/core/context_gpu.h" -#include "softmax_op.h" -#include "softmax_with_loss_op.h" -#include "spatial_softmax_with_loss_op.h" +#include "caffe2/operators/softmax_op.h" +#include "caffe2/operators/softmax_with_loss_op.h" +#include "caffe2/operators/spatial_softmax_with_loss_op.h" namespace caffe2 { @@ -70,7 +70,7 @@ __global__ void ProbCrossEntropyKernel( int idx = i * D + j; CUDA_KERNEL_ASSERT(labeldata[idx] >= 0); total_prob += labeldata[idx]; - sum += -logf(max(Pdata[idx], FLT_MIN)) * labeldata[idx] * weight; + sum += -logf(fmaxf(Pdata[idx], FLT_MIN)) * labeldata[idx] * weight; } float tot = BlockReduce(temp_storage).Sum(sum); __syncthreads(); @@ -78,7 +78,7 @@ __global__ void ProbCrossEntropyKernel( if (threadIdx.x == 0) { Ydata[i] = tot; // Sanity check - CUDA_KERNEL_ASSERT(abs(1.0 - total_prob_sum) < 1e-5f); + CUDA_KERNEL_ASSERT(fabsf(1.0 - total_prob_sum) < 1e-5f); } __syncthreads(); } @@ -118,14 +118,14 @@ __global__ void SpatialSoftmaxKernel( float max_val = -FLT_MAX; for(int c = 0; c < D; ++c) { int idx = i * (H * W * D) + c * (H * W) + y * W + x; - max_val = max(max_val, Xdata[idx]); + max_val = fmaxf(max_val, Xdata[idx]); } // Exponentiate float expsum = 0.0f; for(int c = 0; c < D; ++c) { int idx = i * (H * W * D) + c * (H * W) + y * W + x; - float expx = exp(Xdata[idx] - max_val); + float expx = expf(Xdata[idx] - max_val); Pdata[idx] = expx; expsum += expx; } @@ -160,7 +160,7 @@ __global__ void SpatialCrossEntropyLossKernel( if (label != DONTCARE) { 
CUDA_KERNEL_ASSERT(label >= 0 && label < D); float weight = (weights == NULL ? 1.0 : weights[index]); - loss_data[index] = -log(max( + loss_data[index] = -logf(fmaxf( Pdata[i * W * H * D + label * W * H + y * W + x], 1e-20f)) * weight; weight_data[index] = weight; } else { @@ -213,7 +213,7 @@ __global__ void SoftmaxNormalizeLogsKernel( float* out_log) { CUDA_1D_KERNEL_LOOP(index, nthreads) { int n = index / D; - out_log[index] = logits[index] - rowmax[n] - logf(max(scales[n], FLT_MIN)); + out_log[index] = logits[index] - rowmax[n] - logf(fmaxf(scales[n], FLT_MIN)); } } diff --git a/caffe2/operators/softplus_op.cu b/caffe2/operators/softplus_op.cu index e733c47a6be386..7e542f5a9b7c84 100644 --- a/caffe2/operators/softplus_op.cu +++ b/caffe2/operators/softplus_op.cu @@ -26,7 +26,7 @@ bool SoftplusOp::RunOnDevice() { auto* Y = Output(0); DCHECK_GT(X.size(), 0); Y->ResizeLike(X); - SoftplusKernel<<< + SoftplusKernel<<< CAFFE_GET_BLOCKS(X.size()), CAFFE_CUDA_NUM_THREADS, 0, @@ -43,7 +43,7 @@ bool SoftplusGradientOp::RunOnDevice() { DCHECK_GT(Y.size(), 0); DCHECK_EQ(dY.size(), Y.size()); dX->ResizeLike(Y); - SoftplusGradientKernel<<< + SoftplusGradientKernel<<< CAFFE_GET_BLOCKS(Y.size()), CAFFE_CUDA_NUM_THREADS, 0, diff --git a/caffe2/operators/softsign_op.cu b/caffe2/operators/softsign_op.cu index 9eeaad33c4251e..e3a32507adad66 100644 --- a/caffe2/operators/softsign_op.cu +++ b/caffe2/operators/softsign_op.cu @@ -14,13 +14,26 @@ inline __host__ __device__ T SquareCUDA(const T x) { return x * x; } +template +inline __device__ T typed_abs(T x); + +template <> +inline __device__ float typed_abs(float x) { + return fabsf(x); +} + +template <> +inline __device__ double typed_abs(double x) { + return fabs(x); +} + template __global__ void SoftsignCUDAKernel(const int N, const T* X, T* Y) { CUDA_1D_KERNEL_LOOP(i, N) { #if __CUDA_ARCH__ >= 350 - Y[i] = __ldg(X + i) / (T(1) + abs(__ldg(X + i))); + Y[i] = __ldg(X + i) / (T(1) + typed_abs(__ldg(X + i))); #else - Y[i] = X[i] / (T(1) + abs(X[i])); + Y[i] = X[i] / (T(1) + typed_abs(X[i])); #endif } } @@ -30,9 +43,9 @@ __global__ void SoftsignGradientCUDAKernel(const int N, const T* dY, const T* X, T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { #if __CUDA_ARCH__ >= 350 - dX[i] = __ldg(dY + i) / SquareCUDA(T(1) + abs(__ldg(X + i))); + dX[i] = __ldg(dY + i) / SquareCUDA(T(1) + typed_abs(__ldg(X + i))); #else - dX[i] = dY[i] / SquareCUDA(T(1) + abs(X[i])); + dX[i] = dY[i] / SquareCUDA(T(1) + typed_abs(X[i])); #endif } } diff --git a/caffe2/operators/sparse_to_dense_mask_op.h b/caffe2/operators/sparse_to_dense_mask_op.h index 517c9ac1639617..5ea10d17c3f29f 100644 --- a/caffe2/operators/sparse_to_dense_mask_op.h +++ b/caffe2/operators/sparse_to_dense_mask_op.h @@ -37,6 +37,9 @@ class SparseToDenseMaskBase : public Operator { } } + // TODO: enable the filler + DISABLE_INPUT_FILLERS(Context) + protected: const int64_t kMaxDenseSize = 1024 * 128; diff --git a/caffe2/operators/sparse_to_dense_op.cu b/caffe2/operators/sparse_to_dense_op.cu index 1086c0a6c521d5..c62718a8ece1b7 100644 --- a/caffe2/operators/sparse_to_dense_op.cu +++ b/caffe2/operators/sparse_to_dense_op.cu @@ -1,4 +1,4 @@ -#include "sparse_to_dense_op.h" +#include "caffe2/operators/sparse_to_dense_op.h" #include "caffe2/core/common_gpu.h" #include "caffe2/core/context_gpu.h" diff --git a/caffe2/operators/top_k_radix_selection.cuh b/caffe2/operators/top_k_radix_selection.cuh index c9913df9f41053..69a9710ec78f04 100644 --- a/caffe2/operators/top_k_radix_selection.cuh +++ b/caffe2/operators/top_k_radix_selection.cuh 
@@ -360,7 +360,7 @@ __global__ void gatherTopK(const T* inputPtr, // Find the start offset for our slice const T* inputSliceStart = &inputPtr[slice * inputSliceSize]; T* topKSliceStart = &topKPtr[slice * outputSliceSize]; - caffe2::TIndex* indicesSliceStart = &indicesPtr[slice * outputSliceSize]; + IndicesType* indicesSliceStart = &indicesPtr[slice * outputSliceSize]; // Find the k-th highest element in our input T topKValue = (T)0; diff --git a/caffe2/operators/utility_ops.cu b/caffe2/operators/utility_ops.cu index 9e68790f0a262c..12ded223f6925d 100644 --- a/caffe2/operators/utility_ops.cu +++ b/caffe2/operators/utility_ops.cu @@ -1,9 +1,3 @@ -#include -#include -// TODO(jamesreed): I would use here but std::isnan -// and std::isinf are declared constexpr there and the nvidia -// compiler throws an error because of it - #include "caffe2/core/context_gpu.h" #include "caffe2/operators/flatten_op.h" #include "caffe2/operators/minmax_ops.h" @@ -169,7 +163,7 @@ bool NanCheckOp::RunOnDevice() { std::cerr << "NaN idxs:" << std::endl; auto* cpu_X_data = cpu_X.data(); for (size_t i = 0; i < cpu_X.size(); ++i) { - if (isnan(cpu_X_data[i]) || isinf(cpu_X_data[i])) { + if (std::isnan(cpu_X_data[i]) || std::isinf(cpu_X_data[i])) { std::cerr << i << " "; } } @@ -404,7 +398,7 @@ bool ScatterWeightedSumOp::DoRunWithType() { TIndex K = indices.size(); TIndex block_size = M / N; - T* data = output->template mutable_data(); + float* data = output->template mutable_data(); // In order to have all device pointers of x_i (and weight_i similarly) // consecutively in device memory, copy pointers to a host vector and then diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h index d79296c1ebfa40..a0eb0f3c531f03 100644 --- a/caffe2/operators/utility_ops.h +++ b/caffe2/operators/utility_ops.h @@ -703,6 +703,9 @@ class LengthsToSegmentIdsOp : public Operator { USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(LengthsToSegmentIdsOp); + // TODO: enable the InputFillers + DISABLE_INPUT_FILLERS(Context) + bool RunOnDevice() override { auto& input = Input(0); auto* output = Output(0); @@ -758,6 +761,9 @@ class SegmentIdsToLengthsOp : public Operator { USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(SegmentIdsToLengthsOp); + // TODO: enable the InputFillers + DISABLE_INPUT_FILLERS(Context) + bool RunOnDevice() override { return DispatchHelper>::call(this, Input(0)); } @@ -815,6 +821,9 @@ class SegmentIdsToRangesOp : public Operator { USE_OPERATOR_CONTEXT_FUNCTIONS; USE_SIMPLE_CTOR_DTOR(SegmentIdsToRangesOp); + // TODO: enable the InputFillers + DISABLE_INPUT_FILLERS(Context) + bool RunOnDevice() override { return DispatchHelper>::call(this, Input(0)); } diff --git a/caffe2/python/_import_c_extension.py b/caffe2/python/_import_c_extension.py index 5405329e40a58b..ba2cbe1677c8b1 100644 --- a/caffe2/python/_import_c_extension.py +++ b/caffe2/python/_import_c_extension.py @@ -9,33 +9,31 @@ # attempt to load the cpu version. The cpu backend is the minimum required, so # if that still fails, we will exit loud. 
with extension_loader.DlopenGuard(): + has_hip_support = False + has_gpu_support = False + try: from caffe2.python.caffe2_pybind11_state_gpu import * # noqa if num_cuda_devices(): # noqa has_gpu_support = True - else: - has_gpu_support = False - except ImportError as e: - has_gpu_support = False + except ImportError as gpu_e: + logging.info('Failed to import cuda module: {}'.format(gpu_e)) try: from caffe2.python.caffe2_pybind11_state_hip import * # noqa if num_hip_devices(): has_hip_support = True logging.info('This caffe2 python run has AMD GPU support!') - else: - has_hip_support = False - except ImportError as e: - logging.info('Failed to import AMD hip module: {}'.format(e)) + except ImportError as hip_e: + logging.info('Failed to import AMD hip module: {}'.format(hip_e)) logging.warning( 'This caffe2 python run does not have GPU support. ' 'Will run in CPU only mode.') - logging.warning('Debug message: {0}'.format(str(e))) try: from caffe2.python.caffe2_pybind11_state import * # noqa - except ImportError as e: + except ImportError as cpu_e: logging.critical( - 'Cannot load caffe2.python. Error: {0}'.format(str(e))) + 'Cannot load caffe2.python. Error: {0}'.format(str(cpu_e))) sys.exit(1) # libcaffe2_python contains a global Workspace that we need to properly delete diff --git a/caffe2/python/core.py b/caffe2/python/core.py index 3caa3ee715d5d2..9fef5724ad2f62 100644 --- a/caffe2/python/core.py +++ b/caffe2/python/core.py @@ -2732,6 +2732,8 @@ def create_from_proto(cls, plan_proto): assert isinstance(plan_proto, caffe2_pb2.PlanDef) plan = Plan(plan_proto.name) plan._plan.CopyFrom(plan_proto) + del plan._plan.network[:] + del plan._plan.execution_step[:] net_obj_dict = {} net_proto_dict = {} diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index eafdc5385836ee..2a4afc82c8067f 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -481,8 +481,11 @@ def test_create_plan_from_proto_correctly(self): self.assertEqual(len(plan.Steps()), 1) self.assertEqual(len(test_plan.Steps()), 1) + self.assertEqual(len(plan.Proto().network), 9) + self.assertEqual(len(test_plan.Proto().network), 9) + self.assertEqual(len(plan.Proto().execution_step), 1) + self.assertEqual(len(test_plan.Proto().execution_step), 1) self.assertEqual(plan.Steps()[0].Name(), test_plan.Steps()[0].Name()) - self.assertEqual(len(plan.Nets()), len(test_plan.Nets())) for idx in range(0, len(plan.Nets())): # When we create Net for test_plan, we will end up with new Net diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index 9deb588ea866d5..e501a7d41d3ecc 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -251,7 +251,8 @@ def tensors1d(n, min_len=1, max_len=64, dtype=np.float32, elements=None): cpu_do = caffe2_pb2.DeviceOption() gpu_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA) -device_options = [cpu_do] + ([gpu_do] if workspace.has_gpu_support else []) +hip_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.HIP) +device_options = [cpu_do] + ([gpu_do] if workspace.has_gpu_support else []) + ([hip_do] if workspace.has_hip_support else []) # Include device option for each GPU expanded_device_options = [cpu_do] + ( [caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, cuda_gpu_id=i) diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 3d7e76a7176d35..fa62fbe6588d1e 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -888,13 +888,17 @@ def 
kmap(k): return cls._global_renamed_attrs[k] return k c2_op.arg.extend(onnx_node.attrs.caffe2(kmap=kmap)) - if c2_op.type in cls._broadcast_operators: - already_broadcast = False - for arg in c2_op.arg: - if arg.name == 'broadcast': - already_broadcast = True - if not already_broadcast: - c2_op.arg.extend([caffe2.python.utils.MakeArgument('broadcast', 1)]) + + if opset_version < 7: + # onnx opset 7 and newest caffe2 have adopted full onnx broadcast semantics + # so we don't need this hack anymore + if c2_op.type in cls._broadcast_operators: + already_broadcast = False + for arg in c2_op.arg: + if arg.name == 'broadcast': + already_broadcast = True + if not already_broadcast: + c2_op.arg.extend([caffe2.python.utils.MakeArgument('broadcast', 1)]) return c2_op diff --git a/caffe2/python/onnx/tests/c2_ref_test.py b/caffe2/python/onnx/tests/c2_ref_test.py index a531ee37d33a9c..97d824e05897a5 100644 --- a/caffe2/python/onnx/tests/c2_ref_test.py +++ b/caffe2/python/onnx/tests/c2_ref_test.py @@ -37,20 +37,16 @@ def test_dummy_name(self): assert n1 != n2, "Got same names in different calls: {}".format(n1) def test_check_arguments(self): - X = np.random.randn(3, 2).astype(np.float32) - Y = np.random.randn(3, 2).astype(np.float32) - Z = np.zeros((3, 2)).astype(np.float32) - b2 = C.Caffe2Backend() node_def = make_node("Add", inputs = ["X", "Y"], outputs = ["Z"]) - output = b2.convert_node(node_def.SerializeToString(), 6) + b2.convert_node(node_def.SerializeToString(), 6) bad_node_def = make_node("Add", inputs = ["X", "Y"], outputs = ["Z"], foo = 42, bar = 56) with self.assertRaisesRegexp( RuntimeError, ".*?Don't know how to map unexpected argument (foo|bar) \(from operator .*?\).*$"): - output = b2.convert_node(bad_node_def.SerializeToString(), 6) + b2.convert_node(bad_node_def.SerializeToString(), 6) def test_relu_graph(self): X = np.random.randn(3, 2).astype(np.float32) @@ -105,6 +101,37 @@ def sigmoid(x): output = c2_rep.run({"X": X, "Y": Y}) np.testing.assert_almost_equal(output["W3"], W_ref) + def test_upsample(self): + X = np.random.randn(1, 1, 2, 2).astype(np.float32) + width_scale = 2.0 + height_scale = 2.0 + + predict_net = caffe2_pb2.NetDef() + predict_net.name = 'test-upsample-net' + predict_net.external_input[:] = ['X'] + predict_net.external_output[:] = ['Y'] + predict_net.op.extend([ + core.CreateOperator( + 'ResizeNearest', + inputs=['X'], + outputs=['Y'], + width_scale=width_scale, + height_scale=height_scale, + ), + ]) + ws, c2_outputs = c2_native_run_net( + init_net=None, + predict_net=predict_net, + inputs=[X]) + + onnx_model = c2_onnx.caffe2_net_to_onnx_model( + predict_net=predict_net, + value_info={ + 'X': (onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[X.dtype], X.shape) + }) + onnx_outputs = c2.run_model(onnx_model, inputs=[X]) + self.assertSameOutputs(c2_outputs, onnx_outputs) + def test_gemm(self): # simple A = np.random.randn(3, 2).astype(np.float32) @@ -379,9 +406,9 @@ def test_vgg16(self): @unittest.skipIf( os.environ.get('JENKINS_URL'), - 'Running vgg19 on Travis with Python 2 keeps getting OOM!') - def test_vgg19(self): - self._test_net('vgg19') + 'Taking too long to download!') + def test_zfnet(self): + self._test_net('zfnet') def test_inception_v1(self): self._test_net('inception_v1', decimal=2) diff --git a/caffe2/python/onnx/tests/onnx_backend_test.py b/caffe2/python/onnx/tests/onnx_backend_test.py index e1604cc8a36d02..24d6bc83878def 100644 --- a/caffe2/python/onnx/tests/onnx_backend_test.py +++ b/caffe2/python/onnx/tests/onnx_backend_test.py @@ -35,7 +35,6 @@ 
'|test_operator_repeat.*' # Tile is not compliant with ONNX yet '|test_.*pool_.*same.*' # Does not support pool same. '|test_convtranspose.*' # ConvTranspose needs some more complicated translation - '|test_averagepool.*count_include_pad.*' # Waiting for the support in Caffe2 onnx backend. ')') # Quick patch to unbreak master CI, is working on the debugging. diff --git a/caffe2/python/operator_test/affine_channel_op_test.py b/caffe2/python/operator_test/affine_channel_op_test.py index 70aa45d6d50bdd..6e56da29b7f6a9 100644 --- a/caffe2/python/operator_test/affine_channel_op_test.py +++ b/caffe2/python/operator_test/affine_channel_op_test.py @@ -32,17 +32,15 @@ def affine_channel_nhwc_ref(self, X, scale, bias): @given(N=st.integers(1, 5), C=st.integers(1, 5), H=st.integers(1, 5), W=st.integers(1, 5), order=st.sampled_from(["NCHW", "NHWC"]), - is_learnable=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), - in_place=st.booleans(), **hu.gcs) + is_learnable=st.booleans(), in_place=st.booleans(), **hu.gcs) def test_affine_channel_2d( - self, N, C, H, W, order, is_learnable, engine, in_place, gc, dc): + self, N, C, H, W, order, is_learnable, in_place, gc, dc): op = core.CreateOperator( "AffineChannel", ["X", "scale", "bias"], ["X"] if in_place and not is_learnable else ["Y"], order=order, is_learnable=is_learnable, - engine=engine, ) if order == "NCHW": @@ -73,17 +71,15 @@ def ref_op(X, scale, bias): @given(N=st.integers(1, 5), C=st.integers(1, 5), T=st.integers(1, 3), H=st.integers(1, 3), W=st.integers(1, 3), order=st.sampled_from(["NCHW", "NHWC"]), is_learnable=st.booleans(), - engine=st.sampled_from(["", "CUDNN"]), in_place=st.booleans(), - **hu.gcs) + in_place=st.booleans(), **hu.gcs) def test_affine_channel_3d( - self, N, C, T, H, W, order, is_learnable, engine, in_place, gc, dc): + self, N, C, T, H, W, order, is_learnable, in_place, gc, dc): op = core.CreateOperator( "AffineChannel", ["X", "scale", "bias"], ["X"] if in_place and not is_learnable else ["Y"], order=order, is_learnable=is_learnable, - engine=engine, ) if order == "NCHW": diff --git a/caffe2/python/operator_test/batch_moments_op_test.py b/caffe2/python/operator_test/batch_moments_op_test.py new file mode 100644 index 00000000000000..2db25e73892563 --- /dev/null +++ b/caffe2/python/operator_test/batch_moments_op_test.py @@ -0,0 +1,92 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from caffe2.python import core +from hypothesis import given + +import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st + + +class TestBatchMomentsOp(hu.HypothesisTestCase): + def batch_moments_nchw_ref(self, X): + dims = X.shape + N = dims[0] + C = dims[1] + X = X.reshape(N, C, -1) + mu = np.mean(X, axis=(0, 2)) + var = np.mean(np.square(X), axis=(0, 2)) + return [mu, var] + + def batch_moments_nhwc_ref(self, X): + dims = X.shape + C = dims[-1] + X = X.reshape(-1, C) + mu = np.mean(X, axis=0) + var = np.mean(np.square(X), axis=0) + return [mu, var] + + @given(N=st.integers(1, 5), C=st.integers(1, 5), H=st.integers(1, 5), + W=st.integers(1, 5), order=st.sampled_from(["NCHW", "NHWC"]), + **hu.gcs) + def test_batch_moments_2d(self, N, C, H, W, order, gc, dc): + op = core.CreateOperator( + "BatchMoments", + ["X"], + ["mu", "var"], + order=order, + ) + + if order == "NCHW": + X = np.random.randn(N, C, H, W).astype(np.float32) + else: + X = np.random.randn(N, H, W, C).astype(np.float32) + + def ref(X): + if order == "NCHW": + return 
self.batch_moments_nchw_ref(X) + else: + return self.batch_moments_nhwc_ref(X) + + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[X], + reference=ref, + ) + self.assertDeviceChecks(dc, op, [X], [0, 1]) + self.assertGradientChecks(gc, op, [X], 0, [0, 1]) + + @given(N=st.integers(1, 5), C=st.integers(1, 5), T=st.integers(1, 3), + H=st.integers(1, 3), W=st.integers(1, 3), + order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) + def test_batch_moments_3d(self, N, C, T, H, W, order, gc, dc): + op = core.CreateOperator( + "BatchMoments", + ["X"], + ["mu", "var"], + order=order, + ) + + if order == "NCHW": + X = np.random.randn(N, C, T, H, W).astype(np.float32) + else: + X = np.random.randn(N, T, H, W, C).astype(np.float32) + + def ref(X): + if order == "NCHW": + return self.batch_moments_nchw_ref(X) + else: + return self.batch_moments_nhwc_ref(X) + + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[X], + reference=ref, + ) + self.assertDeviceChecks(dc, op, [X], [0, 1]) + self.assertGradientChecks(gc, op, [X], 0, [0, 1]) diff --git a/caffe2/python/operator_test/normalize_op_test.py b/caffe2/python/operator_test/normalize_op_test.py index 965bbe73fec8c8..933d78f4e4f07f 100644 --- a/caffe2/python/operator_test/normalize_op_test.py +++ b/caffe2/python/operator_test/normalize_op_test.py @@ -9,45 +9,45 @@ import hypothesis.strategies as st from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import copy class TestNormalizeOp(hu.HypothesisTestCase): - - @given(X=hu.tensor(min_dim=1, - max_dim=5, - elements=st.floats(min_value=0.5, max_value=1.0)), - **hu.gcs) + @given( + X=hu.tensor( + min_dim=1, max_dim=5, elements=st.floats(min_value=0.5, max_value=1.0) + ), + **hu.gcs + ) def test_normalize(self, X, gc, dc): def ref_normalize(X, axis): - x_normed = X / ( - np.sqrt((X**2).sum(axis=axis, keepdims=True)) + np.finfo(X.dtype).tiny) + x_normed = X / np.maximum( + np.sqrt((X ** 2).sum(axis=axis, keepdims=True)), 1e-12 + ) return (x_normed,) for axis in range(-X.ndim, X.ndim): + x = copy.copy(X) op = core.CreateOperator("Normalize", "X", "Y", axis=axis) self.assertReferenceChecks( - gc, - op, - [X], - functools.partial(ref_normalize, axis=axis)) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - - @given(X=hu.tensor(min_dim=1, - max_dim=5, - elements=st.floats(min_value=0.5, max_value=1.0)), - **hu.gcs) + gc, op, [x], functools.partial(ref_normalize, axis=axis) + ) + self.assertDeviceChecks(dc, op, [x], [0]) + self.assertGradientChecks(gc, op, [x], 0, [0]) + + @given( + X=hu.tensor( + min_dim=1, max_dim=5, elements=st.floats(min_value=0.5, max_value=1.0) + ), + **hu.gcs + ) def test_normalize_L1(self, X, gc, dc): def ref(X, axis): norm = abs(X).sum(axis=axis, keepdims=True) return (X / norm,) for axis in range(-X.ndim, X.ndim): - print('axis: ', axis) + print("axis: ", axis) op = core.CreateOperator("NormalizeL1", "X", "Y", axis=axis) - self.assertReferenceChecks( - gc, - op, - [X], - functools.partial(ref, axis=axis)) + self.assertReferenceChecks(gc, op, [X], functools.partial(ref, axis=axis)) self.assertDeviceChecks(dc, op, [X], [0]) diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index e58f5ba6be1ebe..7c421ff2a870e1 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -5,6 +5,7 @@ #include "caffe2/contrib/script/compiler.h" #include "caffe2/core/asan.h" +#include "caffe2/core/blob_stats.h" #include "caffe2/core/db.h" #include 
"caffe2/core/numa.h" #include "caffe2/core/operator.h" @@ -1433,6 +1434,12 @@ void addGlobalMethods(py::module& m) { CAFFE_ENFORCE(raw_data); return GetNUMANode(raw_data); }); + m.def("get_blob_size_bytes", [](const std::string& blob_name) { + CAFFE_ENFORCE(gWorkspace); + auto* blob = gWorkspace->GetBlob(blob_name); + CAFFE_ENFORCE(blob); + return BlobStat::sizeBytes(*blob); + }); m.def("support_onnx_export", [](const std::string& op) -> bool { const OpSchema* schema = caffe2::OpSchemaRegistry::Schema(op); if (!schema) { diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py index cb73dda38e8120..1c618ac9efabd7 100644 --- a/caffe2/python/workspace.py +++ b/caffe2/python/workspace.py @@ -42,6 +42,7 @@ is_asan = C.is_asan has_gpu_support = C.has_gpu_support +has_hip_support = C.has_hip_support if has_gpu_support: NumCudaDevices = C.num_cuda_devices GetCUDAVersion = C.get_cuda_version @@ -61,6 +62,7 @@ def GetCudaPeerAccessPattern(): IsNUMAEnabled = C.is_numa_enabled GetNumNUMANodes = C.get_num_numa_nodes GetBlobNUMANode = C.get_blob_numa_node +GetBlobSizeBytes = C.get_blob_size_bytes def _GetFreeFlaskPort(): """Get a free flask port.""" diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py index 7e5047c74a5181..78468ec8548af9 100644 --- a/caffe2/python/workspace_test.py +++ b/caffe2/python/workspace_test.py @@ -189,6 +189,30 @@ def testFetchFeedBlobBool(self): self.assertEqual(fetched_back.dtype, np.bool) np.testing.assert_array_equal(fetched_back, data) + def testGetBlobSizeBytes(self): + for dtype in [np.float16, np.float32, np.float64, np.bool, + np.int8, np.int16, np.int32, np.int64, + np.uint8, np.uint16]: + data = np.random.randn(2, 3).astype(dtype) + self.assertTrue(workspace.FeedBlob("testblob_sizeBytes", data), True) + self.assertEqual( + workspace.GetBlobSizeBytes("testblob_sizeBytes"), + 6 * np.dtype(dtype).itemsize) + strs1 = np.array([b'Hello World!', b'abcd']) + strs2 = np.array([b'element1', b'element2']) + strs1_len, strs2_len = 0, 0 + for str in strs1: + strs1_len += len(str) + for str in strs2: + strs2_len += len(str) + self.assertTrue(workspace.FeedBlob("testblob_str1", strs1), True) + self.assertTrue(workspace.FeedBlob("testblob_str2", strs2), True) + # size of blob "testblob_str1" = size_str1 * meta_.itemsize() + strs1_len + # size of blob "testblob_str2" = size_str2 * meta_.itemsize() + strs2_len + self.assertEqual( + workspace.GetBlobSizeBytes("testblob_str1") - + workspace.GetBlobSizeBytes("testblob_str2"), strs1_len - strs2_len) + def testFetchFeedBlobZeroDim(self): data = np.empty(shape=(2, 0, 3), dtype=np.float32) self.assertEqual(workspace.FeedBlob("testblob_empty", data), True) diff --git a/caffe2/sgd/CMakeLists.txt b/caffe2/sgd/CMakeLists.txt index 740d9741508b5c..55d0a9124836e4 100644 --- a/caffe2/sgd/CMakeLists.txt +++ b/caffe2/sgd/CMakeLists.txt @@ -9,6 +9,14 @@ set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp}) file(GLOB tmp *_test.cc) exclude(Caffe2_GPU_SRCS "${Caffe2_GPU_SRCS}" ${tmp}) +# ---[ HIP files +# ------[ general GPU +file(GLOB_RECURSE tmp *_hip.cc) +set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp}) +# exclude test files +file(GLOB_RECURSE tmp *_test.cc) +exclude(Caffe2_HIP_SRCS "${Caffe2_HIP_SRCS}" ${tmp}) + # ---[ CPU files. 
file(GLOB tmp *.cc) set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp}) @@ -16,18 +24,26 @@ set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp}) file(GLOB tmp *_test.cc) exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp}) exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_GPU_SRCS}) +exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_HIP_SRCS}) # ---[ GPU test files file(GLOB tmp *_gpu_test.cc) set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} ${tmp}) +# ---[ HIP test files +file(GLOB_RECURSE tmp *_hip_test.cc) +set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} ${tmp}) + # ---[ CPU test files file(GLOB tmp *_test.cc) set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp}) exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_GPU_TEST_SRCS}) +exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_HIP_TEST_SRCS}) # ---[ Send the lists to the parent scope. set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE) set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE) +set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE) set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE) set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE) +set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE) diff --git a/caffe2/sgd/adagrad_op_gpu.cu b/caffe2/sgd/adagrad_op_gpu.cu index 71e9f9253ab7d7..df43aaf042ac75 100644 --- a/caffe2/sgd/adagrad_op_gpu.cu +++ b/caffe2/sgd/adagrad_op_gpu.cu @@ -1,5 +1,5 @@ #include -#include "adagrad_op.h" +#include "caffe2/sgd/adagrad_op.h" #include "caffe2/core/common_gpu.h" #include "caffe2/core/context_gpu.h" #include "caffe2/utils/mixed_utils.h" @@ -19,7 +19,7 @@ __global__ void AdagradUpdate( CUDA_1D_KERNEL_LOOP(i, N) { float gi = g[i]; float hi = nh[i] = decay * h[i] + gi * gi; - nw[i] = w[i] + lr[0] * gi / (std::sqrt(hi) + epsilon); + nw[i] = w[i] + lr[0] * gi / (sqrtf(hi) + epsilon); } } @@ -63,7 +63,7 @@ __global__ void SparseAdagradKernel( mixed_add(grad[gradIdx] * grad[gradIdx], param_mom[paramIdx]); mixed_store(&mom_new, &(param_mom[paramIdx])); float param_new = mixed_add( - LR * grad[gradIdx] / (sqrt(mom_new) + epsilon), param[paramIdx]); + LR * grad[gradIdx] / (sqrtf(mom_new) + epsilon), param[paramIdx]); mixed_store(¶m_new, &(param[paramIdx])); } } @@ -107,7 +107,7 @@ __global__ void RowWiseSparseAdagradKernel( } __syncthreads(); // update param - float step = lr[0] / (std::sqrt(param_mom[index]) + epsilon); + float step = lr[0] / (sqrtf(param_mom[index]) + epsilon); for (int j = threadIdx.x; j < N; j += blockDim.x) { param[index * N + j] = param[index * N + j] + grad[i * N + j] * step; } diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu index c9b94e62766f88..8eb1b8835c96da 100644 --- a/caffe2/sgd/adam_op_gpu.cu +++ b/caffe2/sgd/adam_op_gpu.cu @@ -1,4 +1,4 @@ -#include "adam_op.h" +#include "caffe2/sgd/adam_op.h" #include "caffe2/core/common_gpu.h" #include "caffe2/core/context_gpu.h" @@ -21,7 +21,7 @@ __global__ void AdamUpdate( float gi = g[i]; float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1); float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2); - ng[i] = lr[0] * correction * mi / (std::sqrt(vi) + eps_hat); + ng[i] = lr[0] * correction * mi / (sqrtf(vi) + eps_hat); } } @@ -66,7 +66,7 @@ __global__ void AdamCompute( float gi = g[i]; float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1); float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2); - float ng = lr[0] * correction * mi / (std::sqrt(vi) + eps_hat); + float ng = lr[0] * correction * mi / (sqrtf(vi) + eps_hat); nw[i] = w[i] + ng; } } @@ -130,7 +130,7 @@ bool 
SparseAdamOp::DoRunWithType() { auto grad_slice_sz = Input(GRAD).size_from_dim(Input(INDICES).ndim()); const auto iter = OperatorBase::Input(ITER).template data()[0]; - const float correction = std::sqrt(1.0f - std::pow(beta2_, iter + 1)) / + const float correction = sqrtf(1.0f - std::pow(beta2_, iter + 1)) / (1.0f - std::pow(beta1_, iter + 1)); SparseAdamKernel diff --git a/caffe2/sgd/fp16_momentum_sgd_op.cu b/caffe2/sgd/fp16_momentum_sgd_op.cu index d8d98bcda90a48..0067fb9a979180 100644 --- a/caffe2/sgd/fp16_momentum_sgd_op.cu +++ b/caffe2/sgd/fp16_momentum_sgd_op.cu @@ -1,10 +1,16 @@ #include "caffe2/core/common_gpu.h" #include "caffe2/core/context_gpu.h" -#include "fp16_momentum_sgd_op.h" +#include "caffe2/sgd/fp16_momentum_sgd_op.h" namespace caffe2 { namespace { + +#ifdef __HIPCC__ +typedef __half half; +typedef __half2 half2; +#endif + __global__ void FP16MomentumSGDKernel( int N, const half2* g, diff --git a/caffe2/sgd/fp32_momentum_sgd_op.cu b/caffe2/sgd/fp32_momentum_sgd_op.cu index 17a0d6badefd68..c7947dac440a23 100644 --- a/caffe2/sgd/fp32_momentum_sgd_op.cu +++ b/caffe2/sgd/fp32_momentum_sgd_op.cu @@ -1,7 +1,7 @@ #include "caffe2/core/common_gpu.h" #include "caffe2/core/context_gpu.h" -#include "fp32_momentum_sgd_op.h" +#include "caffe2/sgd/fp32_momentum_sgd_op.h" namespace caffe2 { namespace { diff --git a/caffe2/sgd/momentum_sgd_op_gpu.cu b/caffe2/sgd/momentum_sgd_op_gpu.cu index 9ef3f7e5d96da7..74d84f1ada8117 100644 --- a/caffe2/sgd/momentum_sgd_op_gpu.cu +++ b/caffe2/sgd/momentum_sgd_op_gpu.cu @@ -1,4 +1,4 @@ -#include "momentum_sgd_op.h" +#include "caffe2/sgd/momentum_sgd_op.h" #include "caffe2/core/common_gpu.h" #include "caffe2/core/context_gpu.h" diff --git a/caffe2/sgd/rmsprop_op_gpu.cu b/caffe2/sgd/rmsprop_op_gpu.cu index dd34e10f97c28a..fd293e240308bd 100644 --- a/caffe2/sgd/rmsprop_op_gpu.cu +++ b/caffe2/sgd/rmsprop_op_gpu.cu @@ -1,4 +1,4 @@ -#include "rmsprop_op.h" +#include "caffe2/sgd/rmsprop_op.h" #include "caffe2/core/common_gpu.h" #include "caffe2/core/context_gpu.h" @@ -21,7 +21,7 @@ __global__ void RmsPropUpdate( nms[i] = ms[i] + (1.0f - decay) * (g[i] * g[i] - ms[i]); // Update momentum estimate nmom[i] = - mom[i] * momentum + lr[0] * g[i] / std::sqrt(epsilon + nms[i]); + mom[i] * momentum + lr[0] * g[i] / sqrtf(epsilon + nms[i]); // New gradient is the momentum ng[i] = nmom[i]; } diff --git a/caffe2/utils/filler.h b/caffe2/utils/filler.h new file mode 100644 index 00000000000000..a2aa32fb56db7f --- /dev/null +++ b/caffe2/utils/filler.h @@ -0,0 +1,126 @@ +#ifndef CAFFE2_FILLER_H_ +#define CAFFE2_FILLER_H_ + +#include + +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +class TensorFiller { + public: + template + void Fill(Tensor* tensor) const { + CAFFE_ENFORCE(context_, "context is null"); + CAFFE_ENFORCE(tensor, "tensor is null"); + auto min = static_cast(min_); + auto max = static_cast(max_); + CAFFE_ENFORCE_LE(min, max); + + Tensor temp_tensor(shape_); + tensor->swap(temp_tensor); + Type* data = tensor->template mutable_data(); + Context_t* context = static_cast(context_); + + // TODO: Come up with a good distribution abstraction so that + // the users could plug in their own distribution. 
+ if (has_fixed_sum_) { + auto fixed_sum = static_cast(fixed_sum_); + CAFFE_ENFORCE_LE(min * tensor->size(), fixed_sum); + CAFFE_ENFORCE_GE(max * tensor->size(), fixed_sum); + math::RandFixedSum( + tensor->size(), min, max, fixed_sum_, data, context); + } else { + math::RandUniform( + tensor->size(), min, max, data, context); + } + } + + template + TensorFiller& Min(Type min) { + min_ = (double)min; + return *this; + } + + template + TensorFiller& Max(Type max) { + max_ = (double)max; + return *this; + } + + template + TensorFiller& FixedSum(Type fixed_sum) { + has_fixed_sum_ = true; + fixed_sum_ = (double)fixed_sum; + return *this; + } + + // a helper function to construct the lengths vector for sparse features + template + TensorFiller& SparseLengths(Type total_length) { + return FixedSum(total_length).Min(0).Max(total_length); + } + + // a helper function to construct the segments vector for sparse features + template + TensorFiller& SparseSegments(Type max_segment) { + CAFFE_ENFORCE(!has_fixed_sum_); + return Min(0).Max(max_segment); + } + + TensorFiller& Shape(const std::vector& shape) { + shape_ = shape; + return *this; + } + + // Use new context so that it is independent from its operator + TensorFiller& Context(Context_t* context) { + context_ = (void*)context; + return *this; + } + + template + TensorFiller( + const std::vector& shape, + Type fixed_sum, + Context_t* context) + : shape_(shape), + has_fixed_sum_(true), + fixed_sum_((double)fixed_sum), + context_((void*)context) {} + + TensorFiller(const std::vector& shape, Context_t* context) + : shape_(shape), + has_fixed_sum_(false), + fixed_sum_(0), + context_((void*)context) {} + + TensorFiller() : TensorFiller({}, (Context_t*)nullptr) {} + + std::string DebugString() const { + std::stringstream stream; + stream << "shape = [" << shape_ << "]; min = " << min_ + << "; max = " << max_; + if (has_fixed_sum_) { + stream << "; fixed sum = " << fixed_sum_; + } + return stream.str(); + } + + private: + std::vector shape_; + // TODO: type is unknown until a user starts to fill data; + // cast everything to double for now. + double min_ = 0.0; + double max_ = 1.0; + bool has_fixed_sum_; + double fixed_sum_; + void* context_; +}; + +} // namespace caffe2 + +#endif // CAFFE2_FILLER_H_ diff --git a/caffe2/utils/hip/math_hip.cc b/caffe2/utils/hip/math_hip.cc index 902478337be5d3..9b2eef54216188 100644 --- a/caffe2/utils/hip/math_hip.cc +++ b/caffe2/utils/hip/math_hip.cc @@ -3,6 +3,7 @@ #include "caffe2/utils/math.h" +#include #include #include #include @@ -14,6 +15,8 @@ #include "caffe2/core/hip/context_hip.h" #include "caffe2/utils/conversions.h" +#include "caffe2/utils/fixed_divisor.h" +#include "caffe2/utils/math_utils.h" #if THRUST_VERSION >= 100800 #define THRUST_SUPPORTS_PER_THREAD @@ -26,40 +29,19 @@ namespace math { namespace { -inline __host__ __device__ bool Not(const bool x) { - return !x; -} - -template -inline __host__ __device__ T Negate(const T& x) { - return -x; -} - -template -inline __host__ __device__ T Square(const T& x) { - return x * x; -} - -template -inline __host__ __device__ T Sign(const T& x) { - return x > 0 ? T(1) : (x < 0 ? 
T(-1) : T(0)); -} - -#define DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Func, expr) \ - template \ - struct Func##Functor { \ - inline __host__ __device__ T \ - operator()(const T& lhs, const T& rhs) const { \ - return lhs expr rhs; \ - } \ - }; \ - template <> \ - struct Func##Functor { \ - inline __host__ __device__ float16 \ - operator()(const float16& lhs, const float16& rhs) const { \ - return convert::To(convert::To( \ - lhs) expr convert::To(rhs)); \ - } \ +#define DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Func, expr) \ + template struct Func##Functor { \ + inline __host__ __device__ T operator()(const T &lhs, \ + const T &rhs) const { \ + return lhs expr rhs; \ + } \ + }; \ + template <> struct Func##Functor { \ + inline __host__ __device__ float16 operator()(const float16 &lhs, \ + const float16 &rhs) const { \ + return convert::To(convert::To( \ + lhs) expr convert::To(rhs)); \ + } \ }; DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Add, +) DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Sub, -) @@ -68,28 +50,18 @@ DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Div, /) #undef DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR template -__global__ void SimpleBinaryOpHIPKernel( - const int N, - const BinaryOperator op, - const TIn* A, - const TIn* B, - TOut* C) { - HIP_1D_KERNEL_LOOP(i, N) { - C[i] = op(A[i], B[i]); - } +__global__ void SimpleBinaryOpHIPKernel(const int N, const BinaryOperator op, + const TIn *A, const TIn *B, TOut *C) { + HIP_1D_KERNEL_LOOP(i, N) { C[i] = op(A[i], B[i]); } } template -__global__ void RowwiseBinaryOpHIPKenel( - const int rows, - const int cols, - const BinaryOperator op, - const TIn* A, - const TIn* B, - TOut* C) { - const int size = rows * cols; +__global__ void RowwiseBinaryOpHIPKernel(const int size, + const FixedDivisor cols, + const BinaryOperator op, const TIn *A, + const TIn *B, TOut *C) { HIP_1D_KERNEL_LOOP(C_index, size) { - const int j = C_index % cols; + const int j = cols.Mod(C_index); const int A_index = broadcast_1st ? j : C_index; const int B_index = broadcast_1st ? C_index : j; C[C_index] = op(A[A_index], B[B_index]); @@ -97,16 +69,12 @@ __global__ void RowwiseBinaryOpHIPKenel( } template -__global__ void ColwiseBinaryOpHIPKenel( - const int rows, - const int cols, - const BinaryOperator op, - const TIn* A, - const TIn* B, - TOut* C) { - const int size = rows * cols; +__global__ void ColwiseBinaryOpHIPKernel(const int size, + const FixedDivisor cols, + const BinaryOperator op, const TIn *A, + const TIn *B, TOut *C) { HIP_1D_KERNEL_LOOP(C_index, size) { - const int i = C_index / cols; + const int i = cols.Div(C_index); const int A_index = broadcast_1st ? i : C_index; const int B_index = broadcast_1st ? C_index : i; C[C_index] = op(A[A_index], B[B_index]); @@ -114,260 +82,154 @@ __global__ void ColwiseBinaryOpHIPKenel( } template -__global__ void BroadcastBinaryOpHIPKernel( - const int size, - const SimpleArray A_strides, - const SimpleArray B_strides, - const SimpleArray C_dims, - const BinaryOperator op, - const TIn* A, - const TIn* B, - TOut* C) { +__global__ void +BroadcastBinaryOpHIPKernel(const int size, const SimpleArray A_strides, + const SimpleArray B_strides, + const SimpleArray, D> C_dims, + const BinaryOperator op, const TIn *A, const TIn *B, + TOut *C) { HIP_1D_KERNEL_LOOP(C_index, size) { int A_index = 0; int B_index = 0; int C_index_val = C_index; #pragma unroll for (int i = D - 1; i >= 0; --i) { - const int d = C_index_val % C_dims.data[i]; - A_index += A_strides.data[i] == 0 ? 
0 : d * A_strides.data[i]; - B_index += B_strides.data[i] == 0 ? 0 : d * B_strides.data[i]; - C_index_val /= C_dims.data[i]; + int d; + C_dims.data[i].DivMod(C_index_val, &C_index_val, &d); + A_index += d * A_strides.data[i]; + B_index += d * B_strides.data[i]; } C[C_index] = op(A[A_index], B[B_index]); } } template -void BinaryOpWith2DBroadcasting( - const int ndim, - const int* dims, - const int pivot, - const bool rowwise_broadcast, - const bool broadcast_1st, - const BinaryOperator& op, - const TIn* A, - const TIn* B, - TOut* C, - HIPContext* context) { +void BinaryOpWith2DBroadcasting(const int ndim, const int *dims, + const int pivot, const bool rowwise_broadcast, + const bool broadcast_1st, + const BinaryOperator &op, const TIn *A, + const TIn *B, TOut *C, HIPContext *context) { const int rows = std::accumulate(dims, dims + pivot, 1, std::multiplies()); const int cols = std::accumulate(dims + pivot, dims + ndim, 1, std::multiplies()); + if (rows == 0 || cols == 0) { + return; + } const int size = rows * cols; + const FixedDivisor cols_div(cols); if (rowwise_broadcast) { if (broadcast_1st) { hipLaunchKernelGGL( - (RowwiseBinaryOpHIPKenel), - dim3(CAFFE_GET_BLOCKS(size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - rows, - cols, - op, - A, - B, - C); + (RowwiseBinaryOpHIPKernel), + dim3(CAFFE_GET_BLOCKS(size)), dim3(CAFFE_HIP_NUM_THREADS), 0, + context->hip_stream(), size, cols_div, op, A, B, C); } else { hipLaunchKernelGGL( - (RowwiseBinaryOpHIPKenel), - dim3(CAFFE_GET_BLOCKS(size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - rows, - cols, - op, - A, - B, - C); + (RowwiseBinaryOpHIPKernel), + dim3(CAFFE_GET_BLOCKS(size)), dim3(CAFFE_HIP_NUM_THREADS), 0, + context->hip_stream(), size, cols_div, op, A, B, C); } } else { if (broadcast_1st) { hipLaunchKernelGGL( - (ColwiseBinaryOpHIPKenel), - dim3(CAFFE_GET_BLOCKS(size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - rows, - cols, - op, - A, - B, - C); + (ColwiseBinaryOpHIPKernel), + dim3(CAFFE_GET_BLOCKS(size)), dim3(CAFFE_HIP_NUM_THREADS), 0, + context->hip_stream(), size, cols_div, op, A, B, C); } else { hipLaunchKernelGGL( - (ColwiseBinaryOpHIPKenel), - dim3(CAFFE_GET_BLOCKS(size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - rows, - cols, - op, - A, - B, - C); + (ColwiseBinaryOpHIPKernel), + dim3(CAFFE_GET_BLOCKS(size)), dim3(CAFFE_HIP_NUM_THREADS), 0, + context->hip_stream(), size, cols_div, op, A, B, C); } } } template -void BroadcastBinaryOpImpl( - const int* A_dims, - const int* B_dims, - const int* C_dims, - const BinaryOperator& op, - const TIn* A, - const TIn* B, - TOut* C, - HIPContext* context) { +void BroadcastBinaryOpImpl(const int *A_dims, const int *B_dims, + const int *C_dims, const BinaryOperator &op, + const TIn *A, const TIn *B, TOut *C, + HIPContext *context) { SimpleArray A_strides_array; SimpleArray B_strides_array; - SimpleArray C_dims_array; + SimpleArray, D> C_dims_array; int A_stride = 1; int B_stride = 1; for (int i = D - 1; i >= 0; --i) { + if (C_dims[i] == 0) { + return; + } A_strides_array.data[i] = A_dims[i] == 1 ? 0 : A_stride; B_strides_array.data[i] = B_dims[i] == 1 ? 
0 : B_stride; A_stride *= A_dims[i]; B_stride *= B_dims[i]; + C_dims_array.data[i] = FixedDivisor(C_dims[i]); } - std::copy(C_dims, C_dims + D, C_dims_array.data); const int size = std::accumulate(C_dims, C_dims + D, 1, std::multiplies()); - hipLaunchKernelGGL( - (BroadcastBinaryOpHIPKernel), - dim3(CAFFE_GET_BLOCKS(size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - size, - A_strides_array, - B_strides_array, - C_dims_array, - op, - A, - B, - C); + hipLaunchKernelGGL((BroadcastBinaryOpHIPKernel), + dim3(CAFFE_GET_BLOCKS(size)), dim3(CAFFE_HIP_NUM_THREADS), + 0, context->hip_stream(), size, A_strides_array, + B_strides_array, C_dims_array, op, A, B, C); } template -void BroadcastBinaryOp( - const int A_ndim, - const int* A_dims, - const int B_ndim, - const int* B_dims, - const BinaryOperator& op, - const TIn* A, - const TIn* B, - TOut* C, - HIPContext* context) { +void BroadcastBinaryOp(const int A_ndim, const int *A_dims, const int B_ndim, + const int *B_dims, const BinaryOperator &op, + const TIn *A, const TIn *B, TOut *C, + HIPContext *context) { const int ndim = std::max(A_ndim, B_ndim); std::vector A_dims_array(ndim); std::vector B_dims_array(ndim); std::vector C_dims_array(ndim); - utils::ComputeBroadcastBinaryOpDims( - A_ndim, - A_dims, - B_ndim, - B_dims, - A_dims_array.data(), - B_dims_array.data(), - C_dims_array.data()); + utils::ComputeBroadcastBinaryOpDims(A_ndim, A_dims, B_ndim, B_dims, + A_dims_array.data(), B_dims_array.data(), + C_dims_array.data()); if (A_dims_array == B_dims_array) { - const int size = std::accumulate( - C_dims_array.cbegin(), C_dims_array.cend(), 1, std::multiplies()); - hipLaunchKernelGGL( - (SimpleBinaryOpHIPKernel), - dim3(CAFFE_GET_BLOCKS(size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - size, - op, - A, - B, - C); + const int size = std::accumulate(C_dims_array.cbegin(), C_dims_array.cend(), + 1, std::multiplies()); + hipLaunchKernelGGL((SimpleBinaryOpHIPKernel), + dim3(CAFFE_GET_BLOCKS(size)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + size, op, A, B, C); return; } int pivot; bool broadcast_1st; - if (utils::IsRowwiseBroadcastBinaryOp( - ndim, - A_dims_array.data(), - B_dims_array.data(), - &pivot, - &broadcast_1st)) { + if (utils::IsRowwiseBroadcastBinaryOp(ndim, A_dims_array.data(), + B_dims_array.data(), &pivot, + &broadcast_1st)) { BinaryOpWith2DBroadcasting( - ndim, - C_dims_array.data(), - pivot, - true, - broadcast_1st, - op, - A, - B, - C, + ndim, C_dims_array.data(), pivot, true, broadcast_1st, op, A, B, C, context); return; } - if (utils::IsColwiseBroadcastBinaryOp( - ndim, - A_dims_array.data(), - B_dims_array.data(), - &pivot, - &broadcast_1st)) { + if (utils::IsColwiseBroadcastBinaryOp(ndim, A_dims_array.data(), + B_dims_array.data(), &pivot, + &broadcast_1st)) { BinaryOpWith2DBroadcasting( - ndim, - C_dims_array.data(), - pivot, - false, - broadcast_1st, - op, - A, - B, - C, + ndim, C_dims_array.data(), pivot, false, broadcast_1st, op, A, B, C, context); return; } DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_3( - ndim, - BroadcastBinaryOpImpl, - TIn, - TOut, - BinaryOperator, - A_dims_array.data(), - B_dims_array.data(), - C_dims_array.data(), - op, - A, - B, - C, - context); + ndim, BroadcastBinaryOpImpl, TIn, TOut, BinaryOperator, + A_dims_array.data(), B_dims_array.data(), C_dims_array.data(), op, A, B, + C, context); } + } // namespace -#define DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(T, Func, op) \ - __global__ void Func##HIPKernel(const int N, const T* X, T* Y) { \ - 
HIP_1D_KERNEL_LOOP(i, N) { \ - Y[i] = op(X[i]); \ - } \ - } \ - template <> \ - void Func( \ - const int N, const T* x, T* y, HIPContext* context) { \ - hipLaunchKernelGGL( \ - (Func##HIPKernel), \ - CAFFE_GET_BLOCKS(N), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - N, \ - x, \ - y); \ +#define DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(T, Func, op) \ + __global__ void Func##HIPKernel(const int N, const T *X, T *Y) { \ + HIP_1D_KERNEL_LOOP(i, N) { Y[i] = op(X[i]); } \ + } \ + template <> \ + void Func(const int N, const T *x, T *y, \ + HIPContext *context) { \ + hipLaunchKernelGGL((Func##HIPKernel), CAFFE_GET_BLOCKS(N), \ + CAFFE_HIP_NUM_THREADS, 0, context->hip_stream(), N, x, \ + y); \ } DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Exp, expf) @@ -378,74 +240,70 @@ DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Sin, sinf) DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Asin, asinf) DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Tan, tanf) DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Atan, atanf) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Sinh, sinhf) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Cosh, coshf) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Tanh, tanhf) DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Abs, fabsf) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Sqr, utils::Square) DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Sqrt, sqrtf) DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Rsqrt, rsqrtf) -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Sqr, Square) - -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(bool, Not, Not) - -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Neg, Negate) -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(double, Neg, Negate) -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int32_t, Neg, Negate) -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int64_t, Neg, Negate) - -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Sign, Sign) -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(double, Sign, Sign) -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int32_t, Sign, Sign) -DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int64_t, Sign, Sign) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Cbrt, cbrtf) + +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Cube, utils::Cube) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(double, Cube, utils::Cube) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int32_t, Cube, + utils::Cube) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int64_t, Cube, + utils::Cube) + +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(bool, Not, utils::Not) + +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Neg, utils::Negate) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(double, Neg, utils::Negate) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int32_t, Neg, + utils::Negate) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int64_t, Neg, + utils::Negate) + +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(float, Sign, utils::Sign) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(double, Sign, utils::Sign) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int32_t, Sign, + utils::Sign) +DELEGATE_SIMPLE_HIP_UNARY_FUNCTION(std::int64_t, Sign, + utils::Sign) #undef DELEGATE_SIMPLE_HIP_UNARY_FUNCTION -#define DELEGATE_SINCOS_HIP_FUNCTION(T, fn) \ - __global__ void _Kernel_##T##_##SinCos( \ - const int N, const T* x, T* ys, T* yc) { \ - HIP_1D_KERNEL_LOOP(i, N) { \ - fn(__ldg(x + i), ys + i, yc + i); \ - } \ - } \ - template <> \ - void SinCos( \ - const int N, const T* x, T* ys, T* yc, HIPContext* context) { \ - hipLaunchKernelGGL( \ - (_Kernel_##T##_##SinCos), \ - CAFFE_GET_BLOCKS(N), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - N, \ - x, \ - ys, \ - yc); \ +#define DELEGATE_SINCOS_HIP_FUNCTION(T, fn) \ + __global__ void _Kernel_##T##_##SinCos(const int N, const T 
*x, T *ys, \ + T *yc) { \ + HIP_1D_KERNEL_LOOP(i, N) { fn(__ldg(x + i), ys + i, yc + i); } \ + } \ + template <> \ + void SinCos(const int N, const T *x, T *ys, T *yc, \ + HIPContext *context) { \ + hipLaunchKernelGGL((_Kernel_##T##_##SinCos), CAFFE_GET_BLOCKS(N), \ + CAFFE_HIP_NUM_THREADS, 0, context->hip_stream(), N, x, \ + ys, yc); \ } DELEGATE_SINCOS_HIP_FUNCTION(float, sincosf) DELEGATE_SINCOS_HIP_FUNCTION(double, sincos) -#undef DELEGATE_SINCOS_HIP_FUNCTION - #define DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - void Func( \ - const int N, const TIn* A, const TIn* B, TOut* C, HIPContext* context) { \ - hipLaunchKernelGGL( \ - (SimpleBinaryOpHIPKernel>), \ - CAFFE_GET_BLOCKS(N), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - N, \ - Op(), \ - A, \ - B, \ - C); \ - } - -#define DEFINE_SIMPLE_HIP_COMPARE_FUNCTION(Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(float, bool, Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(double, bool, Func, Op) \ + void Func(const int N, const TIn *A, const TIn *B, TOut *C, \ + HIPContext *context) { \ + hipLaunchKernelGGL((SimpleBinaryOpHIPKernel>), \ + CAFFE_GET_BLOCKS(N), CAFFE_HIP_NUM_THREADS, 0, \ + context->hip_stream(), N, Op(), A, B, C); \ + } + +#define DEFINE_SIMPLE_HIP_COMPARE_FUNCTION(Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(float, bool, Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(double, bool, Func, Op) \ DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(bool, bool, Func, Op) DEFINE_SIMPLE_HIP_COMPARE_FUNCTION(EQ, thrust::equal_to) @@ -457,11 +315,11 @@ DEFINE_SIMPLE_HIP_COMPARE_FUNCTION(GE, thrust::greater_equal) #undef DEFINE_SIMPLE_HIP_COMPARE_FUNCTION -#define DEFINE_SIMPLE_HIP_BINARY_FUNCTION(Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(float, float, Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(double, double, Func, Op) \ +#define DEFINE_SIMPLE_HIP_BINARY_FUNCTION(Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(float, float, Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(double, double, Func, Op) \ DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(float16, float16, Func, Op) DEFINE_SIMPLE_HIP_BINARY_FUNCTION(Add, AddFunctor) @@ -475,9 +333,9 @@ DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(bool, bool, And, thrust::logical_and) DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or) DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor) -#define DEFINE_SIMPLE_HIP_BITWISE_BINARY_FUNCTION(Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(bool, bool, Func, Op) \ - DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ +#define DEFINE_SIMPLE_HIP_BITWISE_BINARY_FUNCTION(Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(bool, bool, Func, Op) \ + DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) DEFINE_SIMPLE_HIP_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and) @@ 
-490,101 +348,69 @@ DELEGATE_SIMPLE_HIP_BINARY_FUNCTION(float, float, ElemwiseMax, thrust::maximum); #undef DELEGATE_SIMPLE_HIP_BINARY_FUNCTION -#define DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(TIn, TOut, Func, Op) \ - template <> \ - void Rowwise##Func( \ - const int rows, \ - const int cols, \ - const TIn* A, \ - const TIn* B, \ - TOut* C, \ - HIPContext* context) { \ - const int size = rows * cols; \ - hipLaunchKernelGGL( \ - (RowwiseBinaryOpHIPKenel, true>), \ - CAFFE_GET_BLOCKS(size), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - rows, \ - cols, \ - Op(), \ - A, \ - B, \ - C); \ - } \ - template <> \ - void Rowwise##Func( \ - const int rows, \ - const int cols, \ - const TIn* A, \ - const TIn* B, \ - TOut* C, \ - HIPContext* context) { \ - const int size = rows * cols; \ - hipLaunchKernelGGL( \ - (RowwiseBinaryOpHIPKenel, false>), \ - CAFFE_GET_BLOCKS(size), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - rows, \ - cols, \ - Op(), \ - A, \ - B, \ - C); \ - } \ - template <> \ - void Colwise##Func( \ - const int rows, \ - const int cols, \ - const TIn* A, \ - const TIn* B, \ - TOut* C, \ - HIPContext* context) { \ - const int size = rows * cols; \ - hipLaunchKernelGGL( \ - (ColwiseBinaryOpHIPKenel, true>), \ - CAFFE_GET_BLOCKS(size), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - rows, \ - cols, \ - Op(), \ - A, \ - B, \ - C); \ - } \ - template <> \ - void Colwise##Func( \ - const int rows, \ - const int cols, \ - const TIn* A, \ - const TIn* B, \ - TOut* C, \ - HIPContext* context) { \ - const int size = rows * cols; \ - hipLaunchKernelGGL( \ - (ColwiseBinaryOpHIPKenel, false>), \ - CAFFE_GET_BLOCKS(size), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - rows, \ - cols, \ - Op(), \ - A, \ - B, \ - C); \ - } - -#define DEFINE_2D_BROADCAST_HIP_COMPARE_FUNCTION(Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(float, bool, Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(double, bool, Func, Op) \ +#define DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(TIn, TOut, Func, Op) \ + template <> \ + void Rowwise##Func(const int rows, const int cols, \ + const TIn *A, const TIn *B, \ + TOut *C, HIPContext *context) { \ + if (rows == 0 || cols == 0) { \ + return; \ + } \ + const int size = rows * cols; \ + const FixedDivisor cols_div(cols); \ + hipLaunchKernelGGL(RowwiseBinaryOpHIPKernel, true>, \ + CAFFE_GET_BLOCKS(size), CAFFE_HIP_NUM_THREADS, 0, \ + context->hip_stream(), size, cols_div, Op(), A, B, \ + C); \ + } \ + template <> \ + void Rowwise##Func(const int rows, const int cols, \ + const TIn *A, const TIn *B, \ + TOut *C, HIPContext *context) { \ + if (rows == 0 || cols == 0) { \ + return; \ + } \ + const int size = rows * cols; \ + const FixedDivisor cols_div(cols); \ + hipLaunchKernelGGL(RowwiseBinaryOpHIPKernel, false>, \ + CAFFE_GET_BLOCKS(size), CAFFE_HIP_NUM_THREADS, 0, \ + context->hip_stream(), size, cols_div, Op(), A, B, \ + C); \ + } \ + template <> \ + void Colwise##Func(const int rows, const int cols, \ + const TIn *A, const TIn *B, \ + TOut *C, HIPContext *context) { \ + if (rows == 0 || cols == 0) { \ + return; \ + } \ + const int size = rows * cols; \ + const FixedDivisor cols_div(cols); \ + hipLaunchKernelGGL(ColwiseBinaryOpHIPKernel, true>, \ + CAFFE_GET_BLOCKS(size), CAFFE_HIP_NUM_THREADS, 0, \ + context->hip_stream(), size, cols_div, Op(), A, 
B, \ + C); \ + } \ + template <> \ + void Colwise##Func(const int rows, const int cols, \ + const TIn *A, const TIn *B, \ + TOut *C, HIPContext *context) { \ + if (rows == 0 || cols == 0) { \ + return; \ + } \ + const int size = rows * cols; \ + const FixedDivisor cols_div(cols); \ + hipLaunchKernelGGL(ColwiseBinaryOpHIPKernel, false>, \ + CAFFE_GET_BLOCKS(size), CAFFE_HIP_NUM_THREADS, 0, \ + context->hip_stream(), size, cols_div, Op(), A, B, \ + C); \ + } + +#define DEFINE_2D_BROADCAST_HIP_COMPARE_FUNCTION(Func, Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(float, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(double, bool, Func, Op) \ DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(bool, bool, Func, Op) DEFINE_2D_BROADCAST_HIP_COMPARE_FUNCTION(EQ, thrust::equal_to) @@ -596,13 +422,13 @@ DEFINE_2D_BROADCAST_HIP_COMPARE_FUNCTION(GE, thrust::greater_equal) #undef DEFINE_2D_BROADCAST_HIP_COMPARE_FUNCTION -#define DEFINE_2D_BROADCAST_HIP_BINARY_FUNCTION(Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION( \ - std::int32_t, std::int32_t, Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION( \ - std::int64_t, std::int64_t, Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(float, float, Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(double, double, Func, Op) \ +#define DEFINE_2D_BROADCAST_HIP_BINARY_FUNCTION(Func, Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, \ + Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, \ + Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(float, float, Func, Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(double, double, Func, Op) \ DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(float16, float16, Func, Op) DEFINE_2D_BROADCAST_HIP_BINARY_FUNCTION(Add, AddFunctor) @@ -616,12 +442,12 @@ DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(bool, bool, And, thrust::logical_and) DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or) DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor) -#define DEFINE_2D_BROADCAST_HIP_BITWISE_BINARY_FUNCTION(Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(bool, bool, Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION( \ - std::int32_t, std::int32_t, Func, Op) \ - DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION( \ - std::int64_t, std::int64_t, Func, Op) +#define DEFINE_2D_BROADCAST_HIP_BITWISE_BINARY_FUNCTION(Func, Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(bool, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, \ + Op) \ + DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, \ + Op) DEFINE_2D_BROADCAST_HIP_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and) DEFINE_2D_BROADCAST_HIP_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or) @@ -631,26 +457,21 @@ DEFINE_2D_BROADCAST_HIP_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) #undef DELEGATE_2D_BROADCAST_HIP_BINARY_FUNCTION -#define DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(TIn, TOut, Func, Op) \ - template <> \ - void Func( \ - const int A_ndim, \ - const int* A_dims, \ - const int B_ndim, \ - const int* B_dims, \ - const TIn* A, \ - const TIn* B, \ - TOut* C, \ - HIPContext* context) { \ - BroadcastBinaryOp>( \ - A_ndim, A_dims, B_ndim, B_dims, Op(), A, B, C, context); \ - } - -#define 
DEFINE_BROADCAST_HIP_COMPARE_FUNCTION(Func, Op) \ - DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ - DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ - DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(float, bool, Func, Op) \ - DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(double, bool, Func, Op) \ +#define DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(TIn, TOut, Func, Op) \ + template <> \ + void Func(const int A_ndim, const int *A_dims, \ + const int B_ndim, const int *B_dims, \ + const TIn *A, const TIn *B, TOut *C, \ + HIPContext *context) { \ + BroadcastBinaryOp>(A_ndim, A_dims, B_ndim, B_dims, \ + Op(), A, B, C, context); \ + } + +#define DEFINE_BROADCAST_HIP_COMPARE_FUNCTION(Func, Op) \ + DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ + DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ + DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(float, bool, Func, Op) \ + DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(double, bool, Func, Op) \ DELEGATE_BROADCAST_HIP_BINARY_FUNCTION(bool, bool, Func, Op) DEFINE_BROADCAST_HIP_COMPARE_FUNCTION(EQ, thrust::equal_to) @@ -693,27 +514,20 @@ DEFINE_BROADCAST_HIP_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) #undef DELEGATE_BROADCAST_HIP_BINARY_FUNCTION -#define DELEGATE_REDUCTION_FUNCTION(T, Funcname, func) \ - template <> \ - void Funcname( \ - const int N, \ - const T* src, \ - T* dst, \ - Tensor* scratch_ptr, \ - HIPContext* context) { \ - size_t memRequired = 0; \ - cub::DeviceReduce::func( \ - nullptr, memRequired, src, dst, N, context->hip_stream()); \ - auto buffer_size = \ - static_cast((memRequired + sizeof(T) - 1) / sizeof(T)); \ - scratch_ptr->Resize(std::vector{buffer_size}); \ - cub::DeviceReduce::func( \ - static_cast(scratch_ptr->mutable_data()), \ - memRequired, \ - src, \ - dst, \ - N, \ - context->hip_stream()); \ +#define DELEGATE_REDUCTION_FUNCTION(T, Funcname, func) \ + template <> \ + void Funcname(const int N, const T *src, T *dst, \ + Tensor *scratch_ptr, \ + HIPContext *context) { \ + size_t memRequired = 0; \ + cub::DeviceReduce::func(nullptr, memRequired, src, dst, N, \ + context->hip_stream()); \ + auto buffer_size = \ + static_cast((memRequired + sizeof(T) - 1) / sizeof(T)); \ + scratch_ptr->Resize(std::vector{buffer_size}); \ + cub::DeviceReduce::func( \ + static_cast(scratch_ptr->mutable_data()), memRequired, src, \ + dst, N, context->hip_stream()); \ } DELEGATE_REDUCTION_FUNCTION(float, ReduceMin, Min) @@ -726,60 +540,34 @@ DELEGATE_REDUCTION_FUNCTION(int64_t, ReduceMax, Max) // Caffe2 gemm provides a simpler interface to the gemm functions, with the // limitation that the data has to be contiguous in memory. template <> -void Gemm( - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int M, - const int N, - const int K, - const float alpha, - const float* A, - const float* B, - const float beta, - float* C, - HIPContext* context, - TensorProto::DataType math_type) { +void Gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const float alpha, + const float *A, const float *B, const float beta, + float *C, HIPContext *context, + TensorProto::DataType math_type) { // Note that rocblas follows fortran order, so the order is different from // the cblas convention. int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; rocblas_operation cuTransA = (TransA == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; + ? 
rocblas_operation_none + : rocblas_operation_transpose; rocblas_operation cuTransB = (TransB == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; - ROCBLAS_ENFORCE(rocblas_sgemm( - context->rocblas_handle(), - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - N)); + ? rocblas_operation_none + : rocblas_operation_transpose; + ROCBLAS_ENFORCE(rocblas_sgemm(context->rocblas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); } template <> -void Gemm( - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int M, - const int N, - const int K, - const float alpha, - const float16* A, - const float16* B, - const float beta, - float16* C, - HIPContext* context, - TensorProto::DataType math_type) { +void Gemm(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const float alpha, + const float16 *A, const float16 *B, + const float beta, float16 *C, + HIPContext *context, + TensorProto::DataType math_type) { CAFFE_THROW("Unsupported math type"); #if ROCBLAS_FP16 // rocblas does not support fp16 yet // Note that cublas follows fortran order, so the order is different from @@ -787,30 +575,15 @@ void Gemm( int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; rocblas_operation cuTransA = (TransA == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; + ? rocblas_operation_none + : rocblas_operation_transpose; rocblas_operation cuTransB = (TransB == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; + ? rocblas_operation_none + : rocblas_operation_transpose; if (math_type == TensorProto_DataType_FLOAT) { - ROCBLAS_CHECK(rocblas_sgemmEx( - context->rocblas_handle(), - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - CUDA_R_16F, - ldb, - A, - CUDA_R_16F, - lda, - &beta, - C, - CUDA_R_16F, - N)); + ROCBLAS_CHECK(rocblas_sgemmEx(context->rocblas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, CUDA_R_16F, ldb, A, + CUDA_R_16F, lda, &beta, C, CUDA_R_16F, N)); } else if (math_type == TensorProto_DataType_FLOAT16) { // convert alpha, beta from float -> __half @@ -841,42 +614,19 @@ void Gemm( } template <> -void BiasCHW( - const float* bias, - const float* bias_multiplier, - const int bias_channels, - const int image_size, - float* image, - HIPContext* context) { - Gemm( - CblasNoTrans, - CblasNoTrans, - bias_channels, - image_size, - 1, - 1, - bias, - bias_multiplier, - 1, - image, - context); +void BiasCHW(const float *bias, const float *bias_multiplier, + const int bias_channels, const int image_size, + float *image, HIPContext *context) { + Gemm(CblasNoTrans, CblasNoTrans, bias_channels, image_size, + 1, 1, bias, bias_multiplier, 1, image, context); } template <> void GemmBatched( - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int batch_size, - const int M, - const int N, - const int K, - const float alpha, - const float* A, - const float* B, - const float beta, - float* C, - HIPContext* context, - Tensor* scratch, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int batch_size, const int M, const int N, const int K, + const float alpha, const float *A, const float *B, const float beta, + float *C, HIPContext *context, Tensor *scratch, TensorProto::DataType math_type) { const int a_stride = M * K; const int b_stride = K * N; @@ -886,63 +636,34 @@ void GemmBatched( const int lda = (TransA == CblasNoTrans) ? 
K : M; const int ldb = (TransB == CblasNoTrans) ? N : K; rocblas_operation cuTransA = (TransA == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; + ? rocblas_operation_none + : rocblas_operation_transpose; rocblas_operation cuTransB = (TransB == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; + ? rocblas_operation_none + : rocblas_operation_transpose; ROCBLAS_ENFORCE(rocblas_sgemm_strided_batched( - context->rocblas_handle(), - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - b_stride, - A, - lda, - a_stride, - &beta, - C, - N, - c_stride, - batch_size)); + context->rocblas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, + b_stride, A, lda, a_stride, &beta, C, N, c_stride, batch_size)); } namespace { -__global__ void FloatToHalfKernel(const int N, const float* X, half* Y) { - HIP_1D_KERNEL_LOOP(i, N) { - Y[i] = __float2half(X[i]); - } +__global__ void FloatToHalfKernel(const int N, const float *X, half *Y) { + HIP_1D_KERNEL_LOOP(i, N) { Y[i] = __float2half(X[i]); } } -__global__ void HalfToFloatKernel(const int N, const half* X, float* Y) { - HIP_1D_KERNEL_LOOP(i, N) { - Y[i] = __half2float(X[i]); - } +__global__ void HalfToFloatKernel(const int N, const half *X, float *Y) { + HIP_1D_KERNEL_LOOP(i, N) { Y[i] = __half2float(X[i]); } } }; // namespace template <> void GemmBatched( - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int batch_size, - const int M, - const int N, - const int K, - const float alpha, - const float16* A, - const float16* B, - const float beta, - float16* C, - HIPContext* context, - Tensor* scratch, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int batch_size, const int M, const int N, const int K, + const float alpha, const float16 *A, const float16 *B, const float beta, + float16 *C, HIPContext *context, Tensor *scratch, TensorProto::DataType math_type) { const int a_stride = M * K; const int b_stride = K * N; @@ -961,74 +682,37 @@ void GemmBatched( size_t out_elems = c_stride * batch_size; scratch->Resize(in_elems + out_elems); - float* scratch_ptr = scratch->mutable_data(); + float *scratch_ptr = scratch->mutable_data(); - float* A_fp32 = scratch_ptr; - float* B_fp32 = scratch_ptr + A_size; - float* C_fp32 = scratch_ptr + A_size + B_size; + float *A_fp32 = scratch_ptr; + float *B_fp32 = scratch_ptr + A_size; + float *C_fp32 = scratch_ptr + A_size + B_size; // cast A, B into fp32 - hipLaunchKernelGGL( - (HalfToFloatKernel), - dim3(CAFFE_GET_BLOCKS(A_size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - A_size, - (half*)A, - A_fp32); - hipLaunchKernelGGL( - (HalfToFloatKernel), - dim3(CAFFE_GET_BLOCKS(B_size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - B_size, - (half*)B, - B_fp32); + hipLaunchKernelGGL((HalfToFloatKernel), dim3(CAFFE_GET_BLOCKS(A_size)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + A_size, (half *)A, A_fp32); + hipLaunchKernelGGL((HalfToFloatKernel), dim3(CAFFE_GET_BLOCKS(B_size)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + B_size, (half *)B, B_fp32); // run fp32 batched Gemm - GemmBatched( - TransA, - TransB, - batch_size, - M, - N, - K, - alpha, - A_fp32, - B_fp32, - beta, - C_fp32, - context); + GemmBatched(TransA, TransB, batch_size, M, N, K, alpha, + A_fp32, B_fp32, beta, C_fp32, context); // cast result back to fp16 - hipLaunchKernelGGL( - (FloatToHalfKernel), - dim3(CAFFE_GET_BLOCKS(batch_size * M * N)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - 
context->hip_stream(), - batch_size * M * N, - C_fp32, - (half*)C); + hipLaunchKernelGGL((FloatToHalfKernel), + dim3(CAFFE_GET_BLOCKS(batch_size * M * N)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + batch_size * M * N, C_fp32, (half *)C); } else { #if ROCBLAS_FP16 // rocblas does not support fp16 yet if (math_type == TensorProto_DataType_FLOAT) { // loop over matrices in the batch for (int i = 0; i < batch_size; ++i) { - math::Gemm( - TransA, - TransB, - M, - N, - K, - alpha, - A + a_stride * i, - B + b_stride * i, - beta, - C + c_stride * i, - context); + math::Gemm(TransA, TransB, M, N, K, alpha, + A + a_stride * i, B + b_stride * i, + beta, C + c_stride * i, context); } } else if (math_type == TensorProto_DataType_FLOAT16) { // Note that cublas follows fortran order, so the order is different from @@ -1036,122 +720,65 @@ void GemmBatched( const int lda = (TransA == CblasNoTrans) ? K : M; const int ldb = (TransB == CblasNoTrans) ? N : K; rocblas_operation cuTransA = (TransA == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; + ? rocblas_operation_none + : rocblas_operation_transpose; rocblas_operation cuTransB = (TransB == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; + ? rocblas_operation_none + : rocblas_operation_transpose; // convert alpha, beta from float -> __half auto alpha_fp16 = convert::floatToHalf(alpha); auto beta_fp16 = convert::floatToHalf(beta); ROCBLAS_ENFORCE(cublasHgemmStridedBatched( - context->rocblas_handle(), - cuTransB, - cuTransA, - N, - M, - K, - &alpha_fp16, - (const __half*)B, - ldb, - b_stride, - (const __half*)A, - lda, - a_stride, - &beta_fp16, - (__half*)C, - N, - c_stride, - batch_size)); + context->rocblas_handle(), cuTransB, cuTransA, N, M, K, &alpha_fp16, + (const __half *)B, ldb, b_stride, (const __half *)A, lda, a_stride, + &beta_fp16, (__half *)C, N, c_stride, batch_size)); } #endif } } template <> -void GemmEx( - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int M, - const int N, - const int K, - const float alpha, - const float* A, - const int lda, - const float* B, - const int ldb, - const float beta, - float* C, - const int ldc, - HIPContext* context) { +void GemmEx(const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, + const int N, const int K, const float alpha, + const float *A, const int lda, const float *B, + const int ldb, const float beta, float *C, + const int ldc, HIPContext *context) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. rocblas_operation cuTransA = (TransA == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; + ? rocblas_operation_none + : rocblas_operation_transpose; rocblas_operation cuTransB = (TransB == CblasNoTrans) - ? rocblas_operation_none - : rocblas_operation_transpose; - ROCBLAS_ENFORCE(rocblas_sgemm( - context->rocblas_handle(), - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc)); + ? 
rocblas_operation_none + : rocblas_operation_transpose; + ROCBLAS_ENFORCE(rocblas_sgemm(context->rocblas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, A, lda, &beta, C, + ldc)); } template <> -void Gemv( - const CBLAS_TRANSPOSE TransA, - const int M, - const int N, - const float alpha, - const float* A, - const float* x, - const float beta, - float* y, - HIPContext* context, - TensorProto::DataType math_type) { +void Gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const float alpha, const float *A, + const float *x, const float beta, float *y, + HIPContext *context, + TensorProto::DataType math_type) { rocblas_operation cuTransA = (TransA == CblasNoTrans) - ? rocblas_operation_transpose - : rocblas_operation_none; - ROCBLAS_ENFORCE(rocblas_sgemv( - context->rocblas_handle(), - cuTransA, - N, - M, - &alpha, - A, - N, - x, - 1, - &beta, - y, - 1)); + ? rocblas_operation_transpose + : rocblas_operation_none; + ROCBLAS_ENFORCE(rocblas_sgemv(context->rocblas_handle(), cuTransA, N, M, + &alpha, A, N, x, 1, &beta, y, 1)); } // Batched Add variants namespace { template -__global__ void AddStripedBatchKernel( - const int N, - const T* first, - T* Y, - const int stripe, - const int batch) { +__global__ void AddStripedBatchKernel(const int N, const T *first, T *Y, + const int stripe, const int batch) { for (int j = 0; j < batch; j++) { - const T* x = first + j * stripe; + const T *x = first + j * stripe; HIP_1D_KERNEL_LOOP(i, N) { float tmpY = convert::To(Y[i]); tmpY += convert::To(x[i]); @@ -1161,26 +788,14 @@ __global__ void AddStripedBatchKernel( } } // namespace -#define CAFFE2_SPECIALIZED_HIP_ADD_STRIPED_BATCH(T) \ - template <> \ - void AddStripedBatch( \ - const int N, \ - const T* first, \ - T* Y, \ - const int stripe, \ - const int batch, \ - HIPContext* context) { \ - hipLaunchKernelGGL( \ - (AddStripedBatchKernel), \ - CAFFE_GET_BLOCKS(N), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - N, \ - first, \ - Y, \ - stripe, \ - batch); \ +#define CAFFE2_SPECIALIZED_HIP_ADD_STRIPED_BATCH(T) \ + template <> \ + void AddStripedBatch(const int N, const T *first, T *Y, \ + const int stripe, const int batch, \ + HIPContext *context) { \ + hipLaunchKernelGGL(AddStripedBatchKernel, CAFFE_GET_BLOCKS(N), \ + CAFFE_HIP_NUM_THREADS, 0, context->hip_stream(), N, \ + first, Y, stripe, batch); \ } CAFFE2_SPECIALIZED_HIP_ADD_STRIPED_BATCH(float); @@ -1188,22 +803,16 @@ CAFFE2_SPECIALIZED_HIP_ADD_STRIPED_BATCH(float16); #undef CAFFE2_SPECIALIZED_HIP_ADD_STRIPED_BATCH template <> -void Gemv( - const CBLAS_TRANSPOSE TransA, - const int M, - const int N, - const float alpha, - const float16* A, - const float16* x, - const float beta, - float16* y, - HIPContext* context, - TensorProto::DataType math_type) { +void Gemv(const CBLAS_TRANSPOSE TransA, const int M, + const int N, const float alpha, const float16 *A, + const float16 *x, const float beta, float16 *y, + HIPContext *context, + TensorProto::DataType math_type) { CAFFE_THROW("Unsupported math type"); #if ROCBLAS_FP16 // rocblas does not support fp16 yet rocblas_operation cuTransA = (TransA == CblasNoTrans) - ? rocblas_operation_transpose - : rocblas_operation_none; + ? rocblas_operation_transpose + : rocblas_operation_none; // sort out what we need to call cublasSgemmEx / cublasHgemm int m = (cuTransA == rocblas_operation_none) ? 
N : M; @@ -1212,71 +821,39 @@ void Gemv( int LDC = m; if (math_type == TensorProto_DataType_FLOAT) { - ROCBLAS_CHECK(cublasSgemmEx( - context->rocblas_handle(), - cuTransA, - rocblas_operation_none, - m, - 1, - k, - &alpha, - A, - CUDA_R_16F, - LDA, - x, - CUDA_R_16F, - k, - &beta, - y, - CUDA_R_16F, - LDC)); + ROCBLAS_CHECK(cublasSgemmEx(context->rocblas_handle(), cuTransA, + rocblas_operation_none, m, 1, k, &alpha, A, + CUDA_R_16F, LDA, x, CUDA_R_16F, k, &beta, y, + CUDA_R_16F, LDC)); } else if (math_type == TensorProto_DataType_FLOAT16) { auto alpha_fp16 = convert::floatToHalf(alpha); auto beta_fp16 = convert::floatToHalf(beta); - ROCBLAS_CHECK(cublasHgemm( - context->rocblas_handle(), - cuTransA, - rocblas_operation_none, - m, - 1, - k, - &alpha_fp16, - (const __half*)A, - LDA, - (const __half*)x, - k, - &beta_fp16, - (__half*)y, - LDC)); + ROCBLAS_CHECK(cublasHgemm(context->rocblas_handle(), cuTransA, + rocblas_operation_none, m, 1, k, &alpha_fp16, + (const __half *)A, LDA, (const __half *)x, k, + &beta_fp16, (__half *)y, LDC)); } else { // fail CAFFE_THROW("Unsupported math type"); } #endif } + namespace { template -__global__ void SetKernel(const int N, const T alpha, T* Y) { - HIP_1D_KERNEL_LOOP(i, N) { - Y[i] = alpha; - } +__global__ void SetKernel(const int N, const T alpha, T *Y) { + HIP_1D_KERNEL_LOOP(i, N) { Y[i] = alpha; } } } // namespace -#define CAFFE2_SPECIALIZED_HIP_SET(T) \ - template <> \ - void Set( \ - const size_t N, const T alpha, T* Y, HIPContext* context) { \ - hipLaunchKernelGGL( \ - (SetKernel), \ - CAFFE_GET_BLOCKS(N), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - static_cast(N), \ - alpha, \ - Y); \ +#define CAFFE2_SPECIALIZED_HIP_SET(T) \ + template <> \ + void Set(const size_t N, const T alpha, T *Y, \ + HIPContext *context) { \ + hipLaunchKernelGGL((SetKernel), CAFFE_GET_BLOCKS(N), \ + CAFFE_HIP_NUM_THREADS, 0, context->hip_stream(), \ + static_cast(N), alpha, Y); \ } CAFFE2_SPECIALIZED_HIP_SET(float); @@ -1294,93 +871,56 @@ CAFFE2_SPECIALIZED_HIP_SET(uint16_t); namespace { template -__global__ void -UniformShift(const size_t N, const float min, const float max, T* x) { +__global__ void UniformShift(const size_t N, const float min, const float max, + T *x) { float scale = max - min; HIP_1D_KERNEL_LOOP(i, N) { x[i] = convert::To(convert::To(x[i]) * scale + min); } } -__global__ void -UniformIntFit(const size_t N, const int min, const int max, unsigned int* x) { - int* x_int = reinterpret_cast(x); +__global__ void UniformIntFit(const size_t N, const int min, const int max, + unsigned int *x) { + int *x_int = reinterpret_cast(x); int range = (max - min + 1); - HIP_1D_KERNEL_LOOP(i, N) { - x_int[i] = min + static_cast(x[i] % range); - } + HIP_1D_KERNEL_LOOP(i, N) { x_int[i] = min + static_cast(x[i] % range); } } } // namespace template <> -void RandUniform( - const size_t n, - const float min, - const float max, - float* r, - HIPContext* context) { +void RandUniform(const size_t n, const float min, + const float max, float *r, + HIPContext *context) { HIPRAND_ENFORCE(hiprandGenerateUniform(context->hiprand_generator(), r, n)); - hipLaunchKernelGGL( - (UniformShift), - dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - min, - max, - r); + hipLaunchKernelGGL((UniformShift), dim3(CAFFE_GET_BLOCKS(n)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), n, + min, max, r); } template <> -void RandUniform( - const size_t n, - const double min, - const double max, - double* r, - HIPContext* context) 
{ +void RandUniform(const size_t n, const double min, + const double max, double *r, + HIPContext *context) { HIPRAND_ENFORCE( hiprandGenerateUniformDouble(context->hiprand_generator(), r, n)); - hipLaunchKernelGGL( - (UniformShift), - dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - min, - max, - r); + hipLaunchKernelGGL((UniformShift), dim3(CAFFE_GET_BLOCKS(n)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), n, + min, max, r); } template <> -void RandUniform( - const size_t n, - const int min, - const int max, - int* r, - HIPContext* context) { - HIPRAND_ENFORCE(hiprandGenerate( - context->hiprand_generator(), reinterpret_cast(r), n)); - hipLaunchKernelGGL( - (UniformIntFit), - dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - min, - max, - reinterpret_cast(r)); +void RandUniform(const size_t n, const int min, const int max, + int *r, HIPContext *context) { + HIPRAND_ENFORCE(hiprandGenerate(context->hiprand_generator(), + reinterpret_cast(r), n)); + hipLaunchKernelGGL((UniformIntFit), dim3(CAFFE_GET_BLOCKS(n)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), n, + min, max, reinterpret_cast(r)); } template -size_t HandleOddLengthRandGaussian( - const size_t n, - const T mean, - const T std, - T* r, - HIPContext* context) { +size_t HandleOddLengthRandGaussian(const size_t n, const T mean, const T std, + T *r, HIPContext *context) { if (n % 2 == 1) { std::default_random_engine generator; std::normal_distribution distribution(mean, std); @@ -1392,41 +932,31 @@ size_t HandleOddLengthRandGaussian( } template <> -void RandGaussian( - const size_t n, - const float mean, - const float std, - float* r, - HIPContext* context) { +void RandGaussian(const size_t n, const float mean, + const float std, float *r, + HIPContext *context) { // If n is odd, we add a random Gaussian value at the end manually // and generate n-1 random values using curandGenerateNormal. // curandGenerateNormal requires n to be even. 
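  // For example, with n = 5 the helper is expected to fill r[4] on the host
  // and return 4 (the even_n used just below), so hiprandGenerateNormal only
  // has to produce r[0..3].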
const size_t even_n = HandleOddLengthRandGaussian(n, mean, std, r, context); - HIPRAND_ENFORCE(hiprandGenerateNormal( - context->hiprand_generator(), r, even_n, mean, std)); + HIPRAND_ENFORCE(hiprandGenerateNormal(context->hiprand_generator(), r, even_n, + mean, std)); } template <> -void RandGaussian( - const size_t n, - const double mean, - const double std, - double* r, - HIPContext* context) { +void RandGaussian(const size_t n, const double mean, + const double std, double *r, + HIPContext *context) { const size_t even_n = HandleOddLengthRandGaussian(n, mean, std, r, context); - HIPRAND_ENFORCE(hiprandGenerateNormalDouble( - context->hiprand_generator(), r, even_n, mean, std)); + HIPRAND_ENFORCE(hiprandGenerateNormalDouble(context->hiprand_generator(), r, + even_n, mean, std)); } template <> -void Dot( - const int n, - const float* a, - const float* b, - float* y, - HIPContext* context) { +void Dot(const int n, const float *a, const float *b, + float *y, HIPContext *context) { float result; ROCBLAS_ENFORCE( rocblas_sdot(context->rocblas_handle(), n, a, 1, b, 1, &result)); @@ -1434,28 +964,14 @@ void Dot( } template <> -void Dot( - const int n, - const float16* a, - const float16* b, - float16* y, - HIPContext* context) { +void Dot(const int n, const float16 *a, const float16 *b, + float16 *y, HIPContext *context) { CAFFE_THROW("Unsupported math type"); #if ROCBLAS_FP16 // rocblas does not support fp16 yet float16 result; // execute with 32-bit math - ROCBLAS_CHECK(cublasDotEx( - context->rocblas_handle(), - n, - a, - CUDA_R_16F, - 1, - b, - CUDA_R_16F, - 1, - &result, - CUDA_R_16F, - CUDA_R_32F)); + ROCBLAS_CHECK(cublasDotEx(context->rocblas_handle(), n, a, CUDA_R_16F, 1, b, + CUDA_R_16F, 1, &result, CUDA_R_16F, CUDA_R_32F)); context->Copy(1, &result, y); #endif } @@ -1466,7 +982,7 @@ void Dot( // reduction here. 
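For reference, the single-block pattern used by SumKernel below can be mirrored on the host. The sketch assumes the first (elided) stage strides over X by the thread count, which is the conventional layout for a one-block reduction, and reuses the 128/32 split that is visible in the kernel; BlockSumSketch is an illustrative name, not something defined in this file.

// Host-side sketch of the one-block reduction performed by SumKernel.
// buf plays the role of the kernel's __shared__ reduction_buffer.
static float BlockSumSketch(const float* X, int N, bool square) {
  constexpr int kThreads = 128;  // SUM_KERNEL_NTHREADS
  float buf[kThreads] = {0.0f};
  // Stage 1: each "thread" idx folds X[idx], X[idx + 128], ... into buf[idx].
  for (int idx = 0; idx < kThreads; ++idx) {
    for (int i = idx; i < N; i += kThreads) {
      buf[idx] += square ? X[i] * X[i] : X[i];
    }
  }
  // Stage 2: 128 -> 32, matching the kernel's idx + 32 / + 64 / + 96 step.
  for (int idx = 0; idx < 32; ++idx) {
    buf[idx] += buf[idx + 32] + buf[idx + 64] + buf[idx + 96];
  }
  // Stage 3: 32 -> 1.
  float sum = 0.0f;
  for (int idx = 0; idx < 32; ++idx) {
    sum += buf[idx];
  }
  return sum;
}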
#define SUM_KERNEL_NTHREADS 128 template -__global__ void SumKernel(const int N, const T* X, T* Y, bool square) { +__global__ void SumKernel(const int N, const T *X, T *Y, bool square) { const int idx = threadIdx.x; __shared__ float reduction_buffer[SUM_KERNEL_NTHREADS]; @@ -1488,7 +1004,8 @@ __global__ void SumKernel(const int N, const T* X, T* Y, bool square) { // 128 -> 32 if (idx < 32) { reduction_buffer[idx] += reduction_buffer[idx + 32] + - reduction_buffer[idx + 64] + reduction_buffer[idx + 96]; + reduction_buffer[idx + 64] + + reduction_buffer[idx + 96]; } __syncthreads(); // 32 -> 1 @@ -1508,21 +1025,16 @@ __global__ void SumKernel(const int N, const T* X, T* Y, bool square) { namespace { -template -__global__ void SumConvertKernel(float* sum, T* dest) { +template __global__ void SumConvertKernel(float *sum, T *dest) { *dest = convert::To(*sum); } template -void SumGenericIter( - const int N, - IterT it, - T*& dest, - HIPContext* context, - Tensor* scratch_ptr) { +void SumGenericIter(const int N, IterT it, T *&dest, HIPContext *context, + Tensor *scratch_ptr) { size_t memRequired = 0; - cub::DeviceReduce::Sum( - nullptr, memRequired, it, dest, N, context->hip_stream()); + cub::DeviceReduce::Sum(nullptr, memRequired, it, dest, N, + context->hip_stream()); auto buffer_size = static_cast((memRequired + sizeof(T) - 1) / sizeof(T)); if (!dest) { @@ -1533,184 +1045,106 @@ void SumGenericIter( scratch_ptr->Resize(std::vector{buffer_size}); } cub::DeviceReduce::Sum( - static_cast(scratch_ptr->template mutable_data()), - memRequired, - it, - dest, - N, - context->hip_stream()); + static_cast(scratch_ptr->template mutable_data()), memRequired, + it, dest, N, context->hip_stream()); } } // namespace template <> -void Sum( - const int N, - const float* x, - float* y, - HIPContext* context, - Tensor* scratch_ptr) { +void Sum(const int N, const float *x, float *y, + HIPContext *context, + Tensor *scratch_ptr) { if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { SumGenericIter(N, x, y, context, scratch_ptr); } else { - hipLaunchKernelGGL( - (SumKernel), - dim3(1), - dim3(SUM_KERNEL_NTHREADS), - 0, - context->hip_stream(), - N, - x, - y, - false); + hipLaunchKernelGGL((SumKernel), dim3(1), dim3(SUM_KERNEL_NTHREADS), 0, + context->hip_stream(), N, x, y, false); } } template <> -void Sum( - const int N, - const int32_t* x, - int32_t* y, - HIPContext* context, - Tensor* scratch_ptr) { +void Sum(const int N, const int32_t *x, int32_t *y, + HIPContext *context, + Tensor *scratch_ptr) { if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { SumGenericIter(N, x, y, context, scratch_ptr); } else { - hipLaunchKernelGGL( - (SumKernel), - dim3(1), - dim3(SUM_KERNEL_NTHREADS), - 0, - context->hip_stream(), - N, - x, - y, - false); + hipLaunchKernelGGL((SumKernel), dim3(1), dim3(SUM_KERNEL_NTHREADS), 0, + context->hip_stream(), N, x, y, false); } } namespace { -template -struct FloatTransform { +template struct FloatTransform { inline __host__ __device__ float operator()(const T v) const { return convert::To(v); } }; } // namespace -#define CAFFE2_MATH_SUM_FUNC(T) \ - template <> \ - void Sum( \ - const int N, \ - const T* x, \ - T* y, \ - HIPContext* context, \ - Tensor* scratch_ptr) { \ - if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { \ - FloatTransform transform; \ - cub::TransformInputIterator, const T*> it( \ - x, transform); \ - float* sum = nullptr; \ - SumGenericIter(N, it, sum, context, scratch_ptr); \ - hipLaunchKernelGGL( \ - (SumConvertKernel), \ - dim3(1), \ - dim3(1), \ - 0, \ - 
context->hip_stream(), \ - sum, \ - y); \ - } else { \ - hipLaunchKernelGGL( \ - (SumKernel), \ - dim3(1), \ - dim3(SUM_KERNEL_NTHREADS), \ - 0, \ - context->hip_stream(), \ - N, \ - x, \ - y, \ - false); \ - } \ +#define CAFFE2_MATH_SUM_FUNC(T) \ + template <> \ + void Sum(const int N, const T *x, T *y, HIPContext *context, \ + Tensor *scratch_ptr) { \ + if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { \ + FloatTransform transform; \ + cub::TransformInputIterator, const T *> it( \ + x, transform); \ + float *sum = nullptr; \ + SumGenericIter(N, it, sum, context, scratch_ptr); \ + hipLaunchKernelGGL((SumConvertKernel), dim3(1), dim3(1), 0, \ + context->hip_stream(), sum, y); \ + } else { \ + hipLaunchKernelGGL((SumKernel), dim3(1), dim3(SUM_KERNEL_NTHREADS), 0, \ + context->hip_stream(), N, x, y, false); \ + } \ } CAFFE2_MATH_SUM_FUNC(float16) #undef CAFFE2_MATH_SUM_FUNC namespace { -template -struct SqrTransform { - inline __host__ __device__ T operator()(const T v) const { - return v * v; - } +template struct SqrTransform { + inline __host__ __device__ T operator()(const T v) const { return v * v; } }; } // namespace template <> -void SumSqr( - const int N, - const float* x, - float* y, - HIPContext* context, - Tensor* scratch_ptr) { +void SumSqr(const int N, const float *x, float *y, + HIPContext *context, + Tensor *scratch_ptr) { if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { SqrTransform transform; - cub::TransformInputIterator, const float*> it( + cub::TransformInputIterator, const float *> it( x, transform); SumGenericIter(N, it, y, context, scratch_ptr); } else { - hipLaunchKernelGGL( - (SumKernel), - dim3(1), - dim3(SUM_KERNEL_NTHREADS), - 0, - context->hip_stream(), - N, - x, - y, - true); - } -} - -#define CAFFE2_MATH_SUMSQR_FUNC(T) \ - template <> \ - void SumSqr( \ - const int N, \ - const T* x, \ - T* y, \ - HIPContext* context, \ - Tensor* scratch_ptr) { \ - if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { \ - FloatTransform float_transform; \ - cub::TransformInputIterator, const T*> \ - float_it(x, float_transform); \ - SqrTransform sqr_transform; \ - cub::TransformInputIterator< \ - float, \ - SqrTransform, \ - decltype(float_it)> \ - it(float_it, sqr_transform); \ - float* sum = nullptr; \ - SumGenericIter(N, it, sum, context, scratch_ptr); \ - hipLaunchKernelGGL( \ - (SumConvertKernel), \ - dim3(1), \ - dim3(1), \ - 0, \ - context->hip_stream(), \ - sum, \ - y); \ - } else { \ - hipLaunchKernelGGL( \ - (SumKernel), \ - dim3(1), \ - dim3(SUM_KERNEL_NTHREADS), \ - 0, \ - context->hip_stream(), \ - N, \ - x, \ - y, \ - true); \ - } \ + hipLaunchKernelGGL((SumKernel), dim3(1), dim3(SUM_KERNEL_NTHREADS), 0, + context->hip_stream(), N, x, y, true); + } +} + +#define CAFFE2_MATH_SUMSQR_FUNC(T) \ + template <> \ + void SumSqr(const int N, const T *x, T *y, \ + HIPContext *context, \ + Tensor *scratch_ptr) { \ + if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { \ + FloatTransform float_transform; \ + cub::TransformInputIterator, const T *> \ + float_it(x, float_transform); \ + SqrTransform sqr_transform; \ + cub::TransformInputIterator, \ + decltype(float_it)> \ + it(float_it, sqr_transform); \ + float *sum = nullptr; \ + SumGenericIter(N, it, sum, context, scratch_ptr); \ + hipLaunchKernelGGL((SumConvertKernel), dim3(1), dim3(1), 0, \ + context->hip_stream(), sum, y); \ + } else { \ + hipLaunchKernelGGL((SumKernel), dim3(1), dim3(SUM_KERNEL_NTHREADS), 0, \ + context->hip_stream(), N, x, y, true); \ + } \ } CAFFE2_MATH_SUMSQR_FUNC(float16) @@ -1719,59 
+1153,32 @@ CAFFE2_MATH_SUMSQR_FUNC(float16) namespace { template -__global__ void -SelectKernel(const int N, const int D, const T* x, const int* idx, T* y) { - HIP_1D_KERNEL_LOOP(i, N) { - y[i] = x[i * D + idx[i]]; - } +__global__ void SelectKernel(const int N, const int D, const T *x, + const int *idx, T *y) { + HIP_1D_KERNEL_LOOP(i, N) { y[i] = x[i * D + idx[i]]; } } } // namespace template <> -void Select( - const int N, - const int D, - const float* x, - const int* idx, - float* y, - HIPContext* context) { - hipLaunchKernelGGL( - (SelectKernel), - dim3(CAFFE_GET_BLOCKS(N)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - N, - D, - x, - idx, - y); +void Select(const int N, const int D, const float *x, + const int *idx, float *y, HIPContext *context) { + hipLaunchKernelGGL((SelectKernel), dim3(CAFFE_GET_BLOCKS(N)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), N, + D, x, idx, y); } template <> -void Select( - const int N, - const int D, - const float16* x, - const int* idx, - float16* y, - HIPContext* context) { - hipLaunchKernelGGL( - (SelectKernel), - dim3(CAFFE_GET_BLOCKS(N)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - N, - D, - x, - idx, - y); +void Select(const int N, const int D, const float16 *x, + const int *idx, float16 *y, + HIPContext *context) { + hipLaunchKernelGGL((SelectKernel), dim3(CAFFE_GET_BLOCKS(N)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), N, + D, x, idx, y); } namespace { template -__global__ void ScaleKernel(const int n, const float alpha, const T* x, T* y) { +__global__ void ScaleKernel(const int n, const float alpha, const T *x, T *y) { HIP_1D_KERNEL_LOOP(i, n) { // y[i] = convert::To(convert::To(x[i]) * alpha); y[i] = convert::Get(convert::Get(x[i]) * alpha); @@ -1779,275 +1186,146 @@ __global__ void ScaleKernel(const int n, const float alpha, const T* x, T* y) { } template -__global__ void -ScaleKernelDeviceAlpha(const int n, const float* alpha, const T* x, T* y) { - HIP_1D_KERNEL_LOOP(i, n) { - y[i] = x[i] * (*alpha); - } +__global__ void ScaleKernelDeviceAlpha(const int n, const float *alpha, + const T *x, T *y) { + HIP_1D_KERNEL_LOOP(i, n) { y[i] = x[i] * (*alpha); } } template -__global__ void PowKernel(const int n, const T* x, const T exponent, T* y) { - HIP_1D_KERNEL_LOOP(i, n) { - y[i] = powf(x[i], exponent); - } +__global__ void PowKernel(const int n, const T *x, const T exponent, T *y) { + HIP_1D_KERNEL_LOOP(i, n) { y[i] = powf(x[i], exponent); } } // fp16 specialization template <> -__global__ void ScaleKernelDeviceAlpha( - const int n, - const float* alpha, - const float16* x, - float16* y) { +__global__ void ScaleKernelDeviceAlpha(const int n, const float *alpha, + const float16 *x, float16 *y) { HIP_1D_KERNEL_LOOP(i, n) { - y[i] = convert::To( - convert::To(x[i]) * (*alpha)); + y[i] = convert::To(convert::To(x[i]) * + (*alpha)); } } } // namespace template <> -void Powx( - const int N, - const float* a, - const float b, - float* y, - HIPContext* context) { - hipLaunchKernelGGL( - (PowKernel), - dim3(CAFFE_GET_BLOCKS(N)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - N, - a, - b, - y); +void Powx(const int N, const float *a, const float b, + float *y, HIPContext *context) { + hipLaunchKernelGGL((PowKernel), dim3(CAFFE_GET_BLOCKS(N)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), N, + a, b, y); } template <> -void Scale( - const int n, - const float alpha, - const float* x, - float* y, - HIPContext* context) { - hipLaunchKernelGGL( - (ScaleKernel), - 
dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - alpha, - x, - y); +void Scale(const int n, const float alpha, const float *x, + float *y, HIPContext *context) { + hipLaunchKernelGGL((ScaleKernel), dim3(CAFFE_GET_BLOCKS(n)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), n, + alpha, x, y); } template <> -void Scale( - const int n, - const float alpha, - const float16* x, - float16* y, - HIPContext* context) { - hipLaunchKernelGGL( - (ScaleKernel), - dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - alpha, - x, - y); +void Scale(const int n, const float alpha, + const float16 *x, float16 *y, + HIPContext *context) { + hipLaunchKernelGGL((ScaleKernel), dim3(CAFFE_GET_BLOCKS(n)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), n, + alpha, x, y); } template <> -void Scale( - const int n, - const float* alpha, - const float* x, - float* y, - HIPContext* context) { - hipLaunchKernelGGL( - (ScaleKernelDeviceAlpha), - dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - alpha, - x, - y); +void Scale(const int n, const float *alpha, const float *x, + float *y, HIPContext *context) { + hipLaunchKernelGGL((ScaleKernelDeviceAlpha), dim3(CAFFE_GET_BLOCKS(n)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), n, + alpha, x, y); } template <> -void Scale( - const int n, - const float* alpha, - const float16* x, - float16* y, - HIPContext* context) { - hipLaunchKernelGGL( - (ScaleKernelDeviceAlpha), - dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - alpha, - x, - y); +void Scale(const int n, const float *alpha, + const float16 *x, float16 *y, + HIPContext *context) { + hipLaunchKernelGGL((ScaleKernelDeviceAlpha), + dim3(CAFFE_GET_BLOCKS(n)), dim3(CAFFE_HIP_NUM_THREADS), 0, + context->hip_stream(), n, alpha, x, y); } template <> -void Axpy( - const int N, - const float alpha, - const float* X, - float* Y, - HIPContext* context) { +void Axpy(const int N, const float alpha, const float *X, + float *Y, HIPContext *context) { ROCBLAS_ENFORCE( rocblas_saxpy(context->rocblas_handle(), N, &alpha, X, 1, Y, 1)); } template <> -void Axpy( - const int N, - const float alpha, - const double* X, - double* Y, - HIPContext* context) { +void Axpy(const int N, const float alpha, const double *X, + double *Y, HIPContext *context) { double alpha_d{alpha}; ROCBLAS_ENFORCE( rocblas_daxpy(context->rocblas_handle(), N, &alpha_d, X, 1, Y, 1)); } template <> -void Axpy( - const int N, - const float alpha, - const float16* X, - float16* Y, - HIPContext* context) { +void Axpy(const int N, const float alpha, const float16 *X, + float16 *Y, HIPContext *context) { CAFFE_THROW("Unsupported math type"); #if ROCBLAS_FP16 - ROCBLAS_CHECK(cublasAxpyEx( - context->rocblas_handle(), - N, - &alpha, - CUDA_R_16F, - X, - CUDA_R_16F, - 1, - Y, - CUDA_R_16F, - 1, - CUDA_R_32F)); + ROCBLAS_CHECK(cublasAxpyEx(context->rocblas_handle(), N, &alpha, CUDA_R_16F, + X, CUDA_R_16F, 1, Y, CUDA_R_16F, 1, CUDA_R_32F)); #endif } namespace { template -__global__ void AxpyKernel(const int n, const float* a, const T* x, T* y) { +__global__ void AxpyKernel(const int n, const float *a, const T *x, T *y) { HIP_1D_KERNEL_LOOP(index, n) { - y[index] = convert::Get( - convert::Get(x[index]) * (*a) + convert::Get(y[index])); + y[index] = convert::Get(convert::Get(x[index]) * (*a) + + convert::Get(y[index])); } } } // namespace template <> -void Axpy( - const int n, - 
const float* alpha, - const float* X, - float* Y, - HIPContext* context) { - hipLaunchKernelGGL( - (AxpyKernel), - dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - alpha, - X, - Y); +void Axpy(const int n, const float *alpha, const float *X, + float *Y, HIPContext *context) { + hipLaunchKernelGGL((AxpyKernel), dim3(CAFFE_GET_BLOCKS(n)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), n, + alpha, X, Y); } template <> -void Axpy( - const int n, - const float* alpha, - const float16* X, - float16* Y, - HIPContext* context) { - hipLaunchKernelGGL( - (AxpyKernel), - dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - alpha, - X, - Y); +void Axpy(const int n, const float *alpha, + const float16 *X, float16 *Y, + HIPContext *context) { + hipLaunchKernelGGL((AxpyKernel), dim3(CAFFE_GET_BLOCKS(n)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), n, + alpha, X, Y); } namespace { template -__global__ void -AxpbyKernel(const int n, const T a, const T* x, const T b, T* y) { - HIP_1D_KERNEL_LOOP(index, n) { - y[index] = x[index] * a + y[index] * b; - } +__global__ void AxpbyKernel(const int n, const T a, const T *x, const T b, + T *y) { + HIP_1D_KERNEL_LOOP(index, n) { y[index] = x[index] * a + y[index] * b; } } } // namespace template <> -void Axpby( - const int n, - const float a, - const float* x, - const float b, - float* y, - HIPContext* context) { - hipLaunchKernelGGL( - (AxpbyKernel), - dim3(CAFFE_GET_BLOCKS(n)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - n, - a, - x, - b, - y); +void Axpby(const int n, const float a, const float *x, + const float b, float *y, HIPContext *context) { + hipLaunchKernelGGL((AxpbyKernel), dim3(CAFFE_GET_BLOCKS(n)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), n, + a, x, b, y); } namespace { template __global__ void Im2ColNCHWHIPKernel( - const int n, - const int input_h, - const int input_w, - const int kernel_h, - const int kernel_w, - const int dilation_h, - const int dilation_w, - const int pad_t, - const int pad_l, - const int stride_h, - const int stride_w, - const int output_h, - const int output_w, - const T* img_data, - T* col_data) { + const int n, const int input_h, const int input_w, const int kernel_h, + const int kernel_w, const int dilation_h, const int dilation_w, + const int pad_t, const int pad_l, const int stride_h, const int stride_w, + const int output_h, const int output_w, const T *img_data, T *col_data) { HIP_1D_KERNEL_LOOP(index, n) { const int w_out = index % output_w; const int h_index = index / output_w; @@ -2057,9 +1335,9 @@ __global__ void Im2ColNCHWHIPKernel( const int h_in = h_out * stride_h - pad_t; const int w_in = w_out * stride_w - pad_l; const int output_size = output_h * output_w; - T* col_data_ptr = + T *col_data_ptr = col_data + (channel_out * output_h + h_out) * output_w + w_out; - const T* img_data_ptr = + const T *img_data_ptr = img_data + (channel_in * input_h + h_in) * input_w + w_in; int dh = 0; for (int i = 0; i < kernel_h; ++i) { @@ -2068,8 +1346,8 @@ __global__ void Im2ColNCHWHIPKernel( const int h = h_in + dh; const int w = w_in + dw; *col_data_ptr = (h >= 0 && w >= 0 && h < input_h && w < input_w) - ? __ldg(img_data_ptr + dh * input_w + dw) - : 0; + ? 
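The Axpy and Axpby kernels above are plain element-wise updates: Axpy with a device-resident alpha computes y[i] = (*alpha) * x[i] + y[i], and Axpby computes y[i] = a * x[i] + b * y[i]. A host-side sketch of both, with the function names chosen here only for illustration:

#include <iostream>
#include <vector>

// y[i] = (*alpha) * x[i] + y[i], with alpha read through a pointer as in AxpyKernel.
void axpy(int n, const float* alpha, const float* x, float* y) {
  for (int i = 0; i < n; ++i) y[i] = x[i] * (*alpha) + y[i];
}

// y[i] = a * x[i] + b * y[i], as in AxpbyKernel.
void axpby(int n, float a, const float* x, float b, float* y) {
  for (int i = 0; i < n; ++i) y[i] = a * x[i] + b * y[i];
}

int main() {
  std::vector<float> x = {1, 2, 3}, y = {10, 20, 30};
  float alpha = 2.0f;
  axpy(3, &alpha, x.data(), y.data());       // y = {12, 24, 36}
  axpby(3, 0.5f, x.data(), 2.0f, y.data());  // y = {24.5, 49, 73.5}
  std::cout << y[0] << " " << y[1] << " " << y[2] << "\n";
}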
__ldg(img_data_ptr + dh * input_w + dw) + : 0; col_data_ptr += output_size; dw += dilation_w; } @@ -2080,29 +1358,18 @@ __global__ void Im2ColNCHWHIPKernel( template __global__ void Im2ColNHWCHIPKernel( - const int n, - const int input_h, - const int input_w, - const int kernel_h, - const int kernel_w, - const int dilation_h, - const int dilation_w, - const int pad_t, - const int pad_l, - const int stride_h, - const int stride_w, - const int output_w, - const int channels, - const T* img_data, - T* col_data) { + const int n, const int input_h, const int input_w, const int kernel_h, + const int kernel_w, const int dilation_h, const int dilation_w, + const int pad_t, const int pad_l, const int stride_h, const int stride_w, + const int output_w, const int channels, const T *img_data, T *col_data) { HIP_1D_KERNEL_LOOP(index, n) { const int channel_in = index % channels; const int w_out = index / channels % output_w; const int h_out = index / channels / output_w; const int h_in = h_out * stride_h - pad_t; const int w_in = w_out * stride_w - pad_l; - T* col_data_ptr = col_data + - (h_out * output_w + w_out) * channels * kernel_h * kernel_w + + T *col_data_ptr = + col_data + (h_out * output_w + w_out) * channels * kernel_h * kernel_w + channel_in; int dh = 0; for (int i = 0; i < kernel_h; ++i) { @@ -2110,9 +1377,10 @@ __global__ void Im2ColNHWCHIPKernel( for (int j = 0; j < kernel_w; ++j) { const int h = h_in + dh; const int w = w_in + dw; - *col_data_ptr = (h >= 0 && w >= 0 && h < input_h && w < input_w) - ? __ldg(img_data + (h * input_w + w) * channels + channel_in) - : 0; + *col_data_ptr = + (h >= 0 && w >= 0 && h < input_h && w < input_w) + ? __ldg(img_data + (h * input_w + w) * channels + channel_in) + : 0; col_data_ptr += channels; dw += dilation_w; } @@ -2122,22 +1390,12 @@ __global__ void Im2ColNHWCHIPKernel( } template -__global__ void Col2ImNCHWHIPKernel( - const int n, - const int input_h, - const int input_w, - const int patch_h, - const int patch_w, - const int dilation_h, - const int dilation_w, - const int pad_t, - const int pad_l, - const int stride_h, - const int stride_w, - const int output_h, - const int output_w, - const T* col_data, - T* img_data) { +__global__ void +Col2ImNCHWHIPKernel(const int n, const int input_h, const int input_w, + const int patch_h, const int patch_w, const int dilation_h, + const int dilation_w, const int pad_t, const int pad_l, + const int stride_h, const int stride_w, const int output_h, + const int output_w, const T *col_data, T *img_data) { const int dpatch_h = dilation_h * (patch_h - 1) + 1; const int dpatch_w = dilation_w * (patch_w - 1) + 1; @@ -2173,22 +1431,12 @@ __global__ void Col2ImNCHWHIPKernel( } template -__global__ void Col2ImNHWCHIPKernel( - const int n, - const int input_w, - const int channels, - const int patch_h, - const int patch_w, - const int dilation_h, - const int dilation_w, - const int pad_t, - const int pad_l, - const int stride_h, - const int stride_w, - const int output_h, - const int output_w, - const T* col_data, - T* img_data) { +__global__ void +Col2ImNHWCHIPKernel(const int n, const int input_w, const int channels, + const int patch_h, const int patch_w, const int dilation_h, + const int dilation_w, const int pad_t, const int pad_l, + const int stride_h, const int stride_w, const int output_h, + const int output_w, const T *col_data, T *img_data) { const int dpatch_h = dilation_h * (patch_h - 1) + 1; const int dpatch_w = dilation_w * (patch_w - 1) + 1; @@ -2212,8 +1460,8 @@ __global__ void Col2ImNHWCHIPKernel( h_k 
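The Im2Col NCHW kernel above decodes each flat work-item index into a (channel, h_out, w_out) output position, walks the kernel window with dilation, and writes zero for taps that fall into the padding. A host-side sketch of the same indexing (single-threaded, same memory layout):

// Single-threaded sketch of the NCHW im2col indexing above; the flat `index`
// plays the role of one HIP work item.
void im2col_nchw(int channels, int input_h, int input_w,
                 int kernel_h, int kernel_w, int dilation_h, int dilation_w,
                 int pad_t, int pad_l, int stride_h, int stride_w,
                 int output_h, int output_w,
                 const float* img, float* col) {
  const int n = channels * output_h * output_w;
  const int output_size = output_h * output_w;
  for (int index = 0; index < n; ++index) {
    const int w_out = index % output_w;
    const int h_out = (index / output_w) % output_h;
    const int channel_in = index / output_w / output_h;
    const int channel_out = channel_in * kernel_h * kernel_w;
    const int h_in = h_out * stride_h - pad_t;
    const int w_in = w_out * stride_w - pad_l;
    float* col_ptr = col + (channel_out * output_h + h_out) * output_w + w_out;
    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        const int h = h_in + i * dilation_h;
        const int w = w_in + j * dilation_w;
        // padding taps contribute zero, as in the kernel's bounds check
        *col_ptr = (h >= 0 && w >= 0 && h < input_h && w < input_w)
                       ? img[(channel_in * input_h + h) * input_w + w]
                       : 0.f;
        col_ptr += output_size;  // next kernel tap lives one output plane further
      }
    }
  }
}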
/= dilation_h; w_k /= dilation_w; const int c_col = (h_k * patch_w + w_k) * channels + c; - val += __ldg( - col_data + (h_col * output_w + w_col) * channels_col + c_col); + val += __ldg(col_data + (h_col * output_w + w_col) * channels_col + + c_col); } } } @@ -2222,18 +1470,13 @@ __global__ void Col2ImNHWCHIPKernel( } template -__global__ void Im2ColNdNCHWHIPKernel( - const int outer_size, - const int inner_size, - const int kernel_size, - SimpleArray img_shape, - SimpleArray col_shape, - SimpleArray kernel_shape, - SimpleArray stride, - SimpleArray dilation, - SimpleArray pad, - const T* X_data, - T* Y_data) { +__global__ void +Im2ColNdNCHWHIPKernel(const int outer_size, const int inner_size, + const int kernel_size, SimpleArray img_shape, + SimpleArray col_shape, + SimpleArray kernel_shape, + SimpleArray stride, SimpleArray dilation, + SimpleArray pad, const T *X_data, T *Y_data) { int d_offset[N]; int d_iter[N]; for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { @@ -2256,7 +1499,7 @@ __global__ void Im2ColNdNCHWHIPKernel( #pragma unroll for (int d_i = 0; d_i < N; ++d_i) { const int d_img = d_iter[d_i] * stride.data[d_i] - pad.data[d_i] + - d_offset[d_i] * dilation.data[d_i]; + d_offset[d_i] * dilation.data[d_i]; is_padding |= d_img < 0 || d_img >= img_shape.data[d_i + 1]; img_index = img_index * img_shape.data[d_i + 1] + d_img; } @@ -2270,22 +1513,16 @@ __global__ void Im2ColNdNCHWHIPKernel( } template -void Im2ColNdNCHWHIPImpl( - const int img_size, - const int col_size, - const int* img_shape, - const int* col_shape, - const int* kernel_shape, - const int* stride, - const int* dilation, - const int* pad, - const float* img_data, - float* col_data, - HIPContext* context) { +void Im2ColNdNCHWHIPImpl(const int img_size, const int col_size, + const int *img_shape, const int *col_shape, + const int *kernel_shape, const int *stride, + const int *dilation, const int *pad, + const float *img_data, float *col_data, + HIPContext *context) { const int outer_size = col_shape[0]; const int inner_size = col_size / outer_size; - const int kernel_size = std::accumulate( - kernel_shape, kernel_shape + N, 1, std::multiplies()); + const int kernel_size = std::accumulate(kernel_shape, kernel_shape + N, 1, + std::multiplies()); SimpleArray img_shape_array; SimpleArray col_shape_array; SimpleArray kernel_shape_array; @@ -2298,42 +1535,25 @@ void Im2ColNdNCHWHIPImpl( std::memcpy(stride_array.data, stride, N * sizeof(int)); std::memcpy(dilation_array.data, dilation, N * sizeof(int)); std::memcpy(pad_array.data, pad, N * sizeof(int)); - hipLaunchKernelGGL( - (Im2ColNdNCHWHIPKernel), - dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - outer_size, - inner_size, - kernel_size, - img_shape_array, - col_shape_array, - kernel_shape_array, - stride_array, - dilation_array, - pad_array, - img_data, - col_data); + hipLaunchKernelGGL((Im2ColNdNCHWHIPKernel), + dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + outer_size, inner_size, kernel_size, img_shape_array, + col_shape_array, kernel_shape_array, stride_array, + dilation_array, pad_array, img_data, col_data); } template -void Col2ImNdNCHWHIPImpl( - const int img_size, - const int col_size, - const int* img_shape, - const int* col_shape, - const int* kernel_shape, - const int* stride, - const int* dilation, - const int* pad, - const float* col_data, - float* img_data, - HIPContext* context) { +void Col2ImNdNCHWHIPImpl(const int 
img_size, const int col_size, + const int *img_shape, const int *col_shape, + const int *kernel_shape, const int *stride, + const int *dilation, const int *pad, + const float *col_data, float *img_data, + HIPContext *context) { const int outer_size = col_shape[0]; const int inner_size = col_size / outer_size; - const int kernel_size = std::accumulate( - kernel_shape, kernel_shape + N, 1, std::multiplies()); + const int kernel_size = std::accumulate(kernel_shape, kernel_shape + N, 1, + std::multiplies()); SimpleArray img_shape_array; SimpleArray col_shape_array; SimpleArray kernel_shape_array; @@ -2347,309 +1567,130 @@ void Col2ImNdNCHWHIPImpl( std::memcpy(dilation_array.data, dilation, N * sizeof(int)); std::memcpy(pad_array.data, pad, N * sizeof(int)); Set(img_size, 0, img_data, context); - hipLaunchKernelGGL( - (Im2ColNdNCHWHIPKernel), - dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - outer_size, - inner_size, - kernel_size, - img_shape_array, - col_shape_array, - kernel_shape_array, - stride_array, - dilation_array, - pad_array, - col_data, - img_data); + hipLaunchKernelGGL((Im2ColNdNCHWHIPKernel), + dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + outer_size, inner_size, kernel_size, img_shape_array, + col_shape_array, kernel_shape_array, stride_array, + dilation_array, pad_array, col_data, img_data); } } // namespace template <> void Im2Col( - const int channels, - const int height, - const int width, - const int kernel_h, - const int kernel_w, - const int dilation_h, - const int dilation_w, - const int pad_t, - const int pad_l, - const int pad_b, - const int pad_r, - const int stride_h, - const int stride_w, - const float* img_data, - float* col_data, - HIPContext* context) { + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int dilation_h, const int dilation_w, + const int pad_t, const int pad_l, const int pad_b, const int pad_r, + const int stride_h, const int stride_w, const float *img_data, + float *col_data, HIPContext *context) { const int dkernel_h = dilation_h * (kernel_h - 1) + 1; const int dkernel_w = dilation_w * (kernel_w - 1) + 1; const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; const int num_kernels = channels * output_h * output_w; hipLaunchKernelGGL( - (Im2ColNCHWHIPKernel), - dim3(CAFFE_GET_BLOCKS(num_kernels)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - num_kernels, - height, - width, - kernel_h, - kernel_w, - dilation_h, - dilation_w, - pad_t, - pad_l, - stride_h, - stride_w, - output_h, - output_w, - img_data, - col_data); + (Im2ColNCHWHIPKernel), dim3(CAFFE_GET_BLOCKS(num_kernels)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), num_kernels, + height, width, kernel_h, kernel_w, dilation_h, dilation_w, pad_t, pad_l, + stride_h, stride_w, output_h, output_w, img_data, col_data); } template <> void Im2Col( - const int channels, - const int height, - const int width, - const int kernel_h, - const int kernel_w, - const int dilation_h, - const int dilation_w, - const int pad_t, - const int pad_l, - const int pad_b, - const int pad_r, - const int stride_h, - const int stride_w, - const float* img_data, - float* col_data, - HIPContext* context) { + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int 
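The launchers above derive the spatial output size from the dilated kernel extent: dkernel = dilation * (k - 1) + 1 and output = (in + pad_lo + pad_hi - dkernel) / stride + 1 with integer division. A worked example (the helper name is illustrative, not part of the diff):

#include <iostream>

// Output extent used by the Im2Col/Col2Im launchers above.
int conv_output_size(int in, int pad_lo, int pad_hi, int kernel, int dilation,
                     int stride) {
  const int dkernel = dilation * (kernel - 1) + 1;       // dilated kernel extent
  return (in + pad_lo + pad_hi - dkernel) / stride + 1;  // floor division
}

int main() {
  // height 7, kernel 3 dilated by 2, padding 1 on both sides, stride 2:
  // dkernel = 2 * (3 - 1) + 1 = 5, output = (7 + 1 + 1 - 5) / 2 + 1 = 3
  std::cout << conv_output_size(7, 1, 1, 3, 2, 2) << "\n";  // prints 3
}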
dilation_h, const int dilation_w, + const int pad_t, const int pad_l, const int pad_b, const int pad_r, + const int stride_h, const int stride_w, const float *img_data, + float *col_data, HIPContext *context) { const int dkernel_h = dilation_h * (kernel_h - 1) + 1; const int dkernel_w = dilation_w * (kernel_w - 1) + 1; const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; const int num_kernels = output_h * output_w * channels; hipLaunchKernelGGL( - (Im2ColNHWCHIPKernel), - dim3(CAFFE_GET_BLOCKS(num_kernels)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - num_kernels, - height, - width, - kernel_h, - kernel_w, - dilation_h, - dilation_w, - pad_t, - pad_l, - stride_h, - stride_w, - output_w, - channels, - img_data, - col_data); + (Im2ColNHWCHIPKernel), dim3(CAFFE_GET_BLOCKS(num_kernels)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), num_kernels, + height, width, kernel_h, kernel_w, dilation_h, dilation_w, pad_t, pad_l, + stride_h, stride_w, output_w, channels, img_data, col_data); } template <> void Col2Im( - const int channels, - const int height, - const int width, - const int kernel_h, - const int kernel_w, - const int dilation_h, - const int dilation_w, - const int pad_t, - const int pad_l, - const int pad_b, - const int pad_r, - const int stride_h, - const int stride_w, - const float* col_data, - float* img_data, - HIPContext* context) { + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int dilation_h, const int dilation_w, + const int pad_t, const int pad_l, const int pad_b, const int pad_r, + const int stride_h, const int stride_w, const float *col_data, + float *img_data, HIPContext *context) { const int dkernel_h = dilation_h * (kernel_h - 1) + 1; const int dkernel_w = dilation_w * (kernel_w - 1) + 1; const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; const int num_kernels = channels * height * width; hipLaunchKernelGGL( - (Col2ImNCHWHIPKernel), - dim3(CAFFE_GET_BLOCKS(num_kernels)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - num_kernels, - height, - width, - kernel_h, - kernel_w, - dilation_h, - dilation_w, - pad_t, - pad_l, - stride_h, - stride_w, - output_h, - output_w, - col_data, - img_data); + (Col2ImNCHWHIPKernel), dim3(CAFFE_GET_BLOCKS(num_kernels)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), num_kernels, + height, width, kernel_h, kernel_w, dilation_h, dilation_w, pad_t, pad_l, + stride_h, stride_w, output_h, output_w, col_data, img_data); } template <> void Col2Im( - const int channels, - const int height, - const int width, - const int kernel_h, - const int kernel_w, - const int dilation_h, - const int dilation_w, - const int pad_t, - const int pad_l, - const int pad_b, - const int pad_r, - const int stride_h, - const int stride_w, - const float* col_data, - float* img_data, - HIPContext* context) { + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int dilation_h, const int dilation_w, + const int pad_t, const int pad_l, const int pad_b, const int pad_r, + const int stride_h, const int stride_w, const float *col_data, + float *img_data, HIPContext *context) { const int dkernel_h = dilation_h * (kernel_h - 1) + 1; const int dkernel_w = dilation_w * (kernel_w - 1) + 1; const int output_h = (height + pad_t + pad_b - 
dkernel_h) / stride_h + 1; const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; const int num_kernels = height * width * channels; hipLaunchKernelGGL( - (Col2ImNHWCHIPKernel), - dim3(CAFFE_GET_BLOCKS(num_kernels)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - num_kernels, - width, - channels, - kernel_h, - kernel_w, - dilation_h, - dilation_w, - pad_t, - pad_l, - stride_h, - stride_w, - output_h, - output_w, - col_data, - img_data); + (Col2ImNHWCHIPKernel), dim3(CAFFE_GET_BLOCKS(num_kernels)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), num_kernels, width, + channels, kernel_h, kernel_w, dilation_h, dilation_w, pad_t, pad_l, + stride_h, stride_w, output_h, output_w, col_data, img_data); } template <> void Im2ColNd( - const int N, - const int img_size, - const int col_size, - const int* img_shape, - const int* col_shape, - const int* kernel_shape, - const int* stride, - const int* dilation, - const int* pad, - const float* img_data, - float* col_data, - HIPContext* context) { + const int N, const int img_size, const int col_size, const int *img_shape, + const int *col_shape, const int *kernel_shape, const int *stride, + const int *dilation, const int *pad, const float *img_data, float *col_data, + HIPContext *context) { DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( - N, - Im2ColNdNCHWHIPImpl, - float, - img_size, - col_size, - img_shape, - col_shape, - kernel_shape, - stride, - dilation, - pad, - img_data, - col_data, - context); + N, Im2ColNdNCHWHIPImpl, float, img_size, col_size, img_shape, col_shape, + kernel_shape, stride, dilation, pad, img_data, col_data, context); } template <> void Col2ImNd( - const int N, - const int img_size, - const int col_size, - const int* img_shape, - const int* col_shape, - const int* kernel_shape, - const int* stride, - const int* dilation, - const int* pad, - const float* col_data, - float* img_data, - HIPContext* context) { + const int N, const int img_size, const int col_size, const int *img_shape, + const int *col_shape, const int *kernel_shape, const int *stride, + const int *dilation, const int *pad, const float *col_data, float *img_data, + HIPContext *context) { DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( - N, - Col2ImNdNCHWHIPImpl, - float, - img_size, - col_size, - img_shape, - col_shape, - kernel_shape, - stride, - dilation, - pad, - col_data, - img_data, - context); + N, Col2ImNdNCHWHIPImpl, float, img_size, col_size, img_shape, col_shape, + kernel_shape, stride, dilation, pad, col_data, img_data, context); } template <> -void CopyMatrix( - const size_t itemsize, - const int M, - const int N, - const void* A, - const int lda, - void* B, - const int ldb, - HIPContext* context, - TypeMeta::TypedCopy copy) { +void CopyMatrix(const size_t itemsize, const int M, const int N, + const void *A, const int lda, void *B, + const int ldb, HIPContext *context, + TypeMeta::TypedCopy copy) { CAFFE_ENFORCE(!copy, "Copy constructor is not supported in HIP context"); - hipMemcpy2DAsync( - B, - ldb * itemsize, - A, - lda * itemsize, - N * itemsize, - M, - hipMemcpyDeviceToDevice, - context->hip_stream()); + hipMemcpy2DAsync(B, ldb * itemsize, A, lda * itemsize, N * itemsize, M, + hipMemcpyDeviceToDevice, context->hip_stream()); } template <> -void CopyVector( - const int N, - const float* src, - float* dst, - HIPContext* context) { +void CopyVector(const int N, const float *src, float *dst, + HIPContext *context) { if (src != dst && N > 0) { - hipMemcpyAsync( - dst, - src, - sizeof(float) * N, - hipMemcpyDeviceToDevice, - 
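CopyMatrix above forwards to hipMemcpy2DAsync, which copies M rows of N items whose rows sit lda (source) and ldb (destination) items apart, with pitches given in bytes. A host-side sketch of the same strided copy:

#include <cstddef>
#include <cstring>

// Copy M rows of N elements; rows are lda / ldb elements apart in the source /
// destination, mirroring hipMemcpy2DAsync's byte pitches.
void copy_matrix(std::size_t itemsize, int M, int N,
                 const void* A, int lda, void* B, int ldb) {
  const char* src = static_cast<const char*>(A);
  char* dst = static_cast<char*>(B);
  for (int i = 0; i < M; ++i) {
    std::memcpy(dst + static_cast<std::size_t>(i) * ldb * itemsize,
                src + static_cast<std::size_t>(i) * lda * itemsize,
                static_cast<std::size_t>(N) * itemsize);
  }
}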
context->hip_stream()); + hipMemcpyAsync(dst, src, sizeof(float) * N, hipMemcpyDeviceToDevice, + context->hip_stream()); } } @@ -2659,13 +1700,9 @@ template using BlockReduce = cub::BlockReduce; template -__global__ void RowwiseReduceKernel( - const int rows, - const int cols, - const Reducer reducer, - const T init, - const T* X, - T* Y) { +__global__ void RowwiseReduceKernel(const int rows, const int cols, + const Reducer reducer, const T init, + const T *X, T *Y) { __shared__ typename BlockReduce::TempStorage temp_storage; for (int i = blockIdx.x; i < rows; i += gridDim.x) { T val = init; @@ -2681,13 +1718,9 @@ __global__ void RowwiseReduceKernel( } template -__global__ void ColwiseReduceKernel( - const int rows, - const int cols, - const Reducer reducer, - const T init, - const T* X, - T* Y) { +__global__ void ColwiseReduceKernel(const int rows, const int cols, + const Reducer reducer, const T init, + const T *X, T *Y) { __shared__ typename BlockReduce::TempStorage temp_storage; for (int i = blockIdx.x; i < cols; i += gridDim.x) { T val = init; @@ -2704,86 +1737,53 @@ __global__ void ColwiseReduceKernel( } // namespace -#define CAFFE2_SPECIALIZED_HIP_ROWWISE_MAX(T) \ - template <> \ - void RowwiseMax( \ - const int N, const int D, const T* x, T* y, HIPContext* context) { \ - hipLaunchKernelGGL( \ - (RowwiseReduceKernel), \ - std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - N, \ - D, \ - cub::Max(), \ - std::numeric_limits::lowest(), \ - x, \ - y); \ +#define CAFFE2_SPECIALIZED_HIP_ROWWISE_MAX(T) \ + template <> \ + void RowwiseMax(const int N, const int D, const T *x, T *y, \ + HIPContext *context) { \ + hipLaunchKernelGGL(RowwiseReduceKernel, \ + std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), \ + CAFFE_HIP_NUM_THREADS, 0, context->hip_stream(), N, D, \ + cub::Max(), std::numeric_limits::lowest(), x, y); \ } CAFFE2_SPECIALIZED_HIP_ROWWISE_MAX(float) #undef CAFFE2_SPECIALIZED_HIP_ROWWISE_MAX -#define CAFFE2_SPECIALIZED_HIP_COLWISE_MAX(T) \ - template <> \ - void ColwiseMax( \ - const int N, const int D, const T* x, T* y, HIPContext* context) { \ - hipLaunchKernelGGL( \ - (ColwiseReduceKernel), \ - std::min(D, CAFFE_MAXIMUM_NUM_BLOCKS), \ - CAFFE_HIP_NUM_THREADS, \ - 0, \ - context->hip_stream(), \ - N, \ - D, \ - cub::Max(), \ - std::numeric_limits::lowest(), \ - x, \ - y); \ +#define CAFFE2_SPECIALIZED_HIP_COLWISE_MAX(T) \ + template <> \ + void ColwiseMax(const int N, const int D, const T *x, T *y, \ + HIPContext *context) { \ + hipLaunchKernelGGL(ColwiseReduceKernel, \ + std::min(D, CAFFE_MAXIMUM_NUM_BLOCKS), \ + CAFFE_HIP_NUM_THREADS, 0, context->hip_stream(), N, D, \ + cub::Max(), std::numeric_limits::lowest(), x, y); \ } CAFFE2_SPECIALIZED_HIP_COLWISE_MAX(float) #undef CAFFE2_SPECIALIZED_HIP_COLWISE_MAX namespace { -__global__ void -maximum_kernel(const int N, const float alpha, const float* x, float* y) { - HIP_1D_KERNEL_LOOP(i, N) { - y[i] = fmaxf(x[i], alpha); - } +__global__ void maximum_kernel(const int N, const float alpha, const float *x, + float *y) { + HIP_1D_KERNEL_LOOP(i, N) { y[i] = fmaxf(x[i], alpha); } } } // namespace template <> -void Maximum( - const int N, - const float alpha, - const float* x, - float* y, - HIPContext* context) { +void Maximum(const int N, const float alpha, const float *x, float *y, + HIPContext *context) { hipLaunchKernelGGL( - (maximum_kernel), - dim3(std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - N, - alpha, - x, - y); + (maximum_kernel), 
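RowwiseReduceKernel and ColwiseReduceKernel above reduce a row-major rows x cols matrix along one dimension, one block per output element, using cub::BlockReduce; RowwiseMax and ColwiseMax instantiate them with cub::Max and lowest() as the identity. A host-side sketch of the max case:

#include <algorithm>
#include <limits>

// One value per row: the identity is lowest(), matching the kernel launch.
void rowwise_max(int N, int D, const float* x, float* y) {
  for (int i = 0; i < N; ++i) {
    float m = std::numeric_limits<float>::lowest();
    for (int j = 0; j < D; ++j) m = std::max(m, x[i * D + j]);
    y[i] = m;
  }
}

// One value per column.
void colwise_max(int N, int D, const float* x, float* y) {
  for (int j = 0; j < D; ++j) {
    float m = std::numeric_limits<float>::lowest();
    for (int i = 0; i < N; ++i) m = std::max(m, x[i * D + j]);
    y[j] = m;
  }
}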
dim3(std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), N, alpha, x, y); } namespace { template -__global__ void ReduceTensorHIPKernel( - const int outer_size, - const int inner_size, - SimpleArray X_strides, - SimpleArray Y_dims, - const Reducer reducer, - const T init, - const T* X, - T* Y) { +__global__ void +ReduceTensorHIPKernel(const int outer_size, const int inner_size, + SimpleArray X_strides, + SimpleArray, D> Y_dims, + const Reducer reducer, const T init, const T *X, T *Y) { __shared__ typename BlockReduce::TempStorage temp_storage; for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { T val = init; @@ -2792,10 +1792,15 @@ __global__ void ReduceTensorHIPKernel( int Y_index = i * inner_size + j; #pragma unroll for (int d = D - 1; d >= 0; --d) { - X_index += (Y_index % Y_dims.data[d]) * X_strides.data[d]; - Y_index /= Y_dims.data[d]; + int r; + Y_dims.data[d].DivMod(Y_index, &Y_index, &r); + X_index += r * X_strides.data[d]; } +#if __HIP_ARCH__ >= 350 val = reducer(val, __ldg(X + X_index)); +#else + val = reducer(val, X[X_index]); +#endif } val = BlockReduce(temp_storage).Reduce(val, reducer); if (threadIdx.x == 0) { @@ -2806,53 +1811,34 @@ __global__ void ReduceTensorHIPKernel( } template -void ReduceTensorHIPImpl( - const int outer_size, - const int inner_size, - const int* dims, - const int* axes, - const Reducer& reducer, - const T& init, - const T* X, - T* Y, - HIPContext* context) { +void ReduceTensorHIPImpl(const int outer_size, const int inner_size, + const int *dims, const int *axes, + const Reducer &reducer, const T &init, const T *X, + T *Y, HIPContext *context) { SimpleArray X_strides; - SimpleArray Y_dims; + SimpleArray, D> Y_dims; utils::ComputeTransposedStrides(D, dims, axes, X_strides.data); for (int i = 0; i < D; ++i) { - Y_dims.data[i] = dims[axes[i]]; + Y_dims.data[i] = FixedDivisor(dims[axes[i]]); } - hipLaunchKernelGGL( - (ReduceTensorHIPKernel), - dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - outer_size, - inner_size, - X_strides, - Y_dims, - reducer, - init, - X, - Y); + hipLaunchKernelGGL((ReduceTensorHIPKernel), + dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + outer_size, inner_size, X_strides, Y_dims, reducer, init, + X, Y); } template -void ReduceTensorHIP( - const int num_dims, - const int* dims, - const int num_axes, - const int* axes, - const Reducer& reducer, - const T& init, - const T* X, - T* Y, - HIPContext* context) { +void ReduceTensorHIP(const int num_dims, const int *dims, const int num_axes, + const int *axes, const Reducer &reducer, const T &init, + const T *X, T *Y, HIPContext *context) { CAFFE_ENFORCE_LE(num_axes, num_dims); + if (X == Y) { + return; + } std::vector transpose_axes(num_dims); - utils::ComputeTransposeAxesForReduceOp( - num_dims, num_axes, axes, transpose_axes.data()); + utils::ComputeTransposeAxesForReduceOp(num_dims, num_axes, axes, + transpose_axes.data()); const int pivot = num_dims - num_axes; int outer_size = 1; for (int i = 0; i < pivot; ++i) { @@ -2862,48 +1848,27 @@ void ReduceTensorHIP( for (int i = pivot; i < num_dims; ++i) { inner_size *= dims[transpose_axes[i]]; } - if (transpose_axes[pivot] == pivot) { - hipLaunchKernelGGL( - (RowwiseReduceKernel), - dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - outer_size, - inner_size, - reducer, - init, - X, - Y); - 
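ReduceTensorHIPKernel above maps every flat (outer, inner) output index back into X by peeling off one coordinate per transposed dimension; this revision replaces the plain % and / pair with FixedDivisor<int>::DivMod, which precomputes the divisor so the device code avoids hardware integer division, and falls back to a non-__ldg load on older architectures. A host-side sketch of the index arithmetic for a sum reduction, using ordinary division:

#include <vector>

// Sum all inner elements feeding output i. Y_dims holds the transposed dims of
// X (reduced dims last) and X_strides the matching transposed strides; the
// device kernel does the same digit-by-digit decomposition with
// FixedDivisor<int>::DivMod instead of % and /.
float reduce_one_outer(int i, int inner_size, int D,
                       const std::vector<int>& Y_dims,
                       const std::vector<int>& X_strides,
                       const float* X) {
  float val = 0.f;  // cub::Sum with identity 0; Min/Max only change the combine op
  for (int j = 0; j < inner_size; ++j) {
    int Y_index = i * inner_size + j;
    int X_index = 0;
    for (int d = D - 1; d >= 0; --d) {
      X_index += (Y_index % Y_dims[d]) * X_strides[d];
      Y_index /= Y_dims[d];
    }
    val += X[X_index];
  }
  return val;
}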
return; + if (outer_size > 0 && inner_size > 0) { + if (transpose_axes[pivot] == pivot) { + hipLaunchKernelGGL((RowwiseReduceKernel), + dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + outer_size, inner_size, reducer, init, X, Y); + return; + } + DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2( + num_dims, ReduceTensorHIPImpl, T, Reducer, outer_size, inner_size, dims, + transpose_axes.data(), reducer, init, X, Y, context); + } else if (outer_size > 0) { + math::Set(outer_size, init, Y, context); } - DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2( - num_dims, - ReduceTensorHIPImpl, - T, - Reducer, - outer_size, - inner_size, - dims, - transpose_axes.data(), - reducer, - init, - X, - Y, - context); } template -void ReduceMeanHIPImpl( - const int num_dims, - const int* dims, - const int num_axes, - const int* axes, - const T* X, - T* Y, - HIPContext* context) { - ReduceTensorHIP( - num_dims, dims, num_axes, axes, cub::Sum(), T(0), X, Y, context); +void ReduceMeanHIPImpl(const int num_dims, const int *dims, const int num_axes, + const int *axes, const T *X, T *Y, HIPContext *context) { + ReduceTensorHIP(num_dims, dims, num_axes, axes, cub::Sum(), T(0), X, Y, + context); const int X_size = std::accumulate(dims, dims + num_dims, 1, std::multiplies()); int scale = 1; @@ -2916,26 +1881,13 @@ void ReduceMeanHIPImpl( } // namespace -#define CAFFE2_SPECIALIZED_HIP_REDUCE_MIN(T) \ - template <> \ - void ReduceMin( \ - const int num_dims, \ - const int* dims, \ - const int num_axes, \ - const int* axes, \ - const T* X, \ - T* Y, \ - HIPContext* context) { \ - ReduceTensorHIP( \ - num_dims, \ - dims, \ - num_axes, \ - axes, \ - cub::Min(), \ - std::numeric_limits::max(), \ - X, \ - Y, \ - context); \ +#define CAFFE2_SPECIALIZED_HIP_REDUCE_MIN(T) \ + template <> \ + void ReduceMin(const int num_dims, const int *dims, \ + const int num_axes, const int *axes, \ + const T *X, T *Y, HIPContext *context) { \ + ReduceTensorHIP(num_dims, dims, num_axes, axes, cub::Min(), \ + std::numeric_limits::max(), X, Y, context); \ } CAFFE2_SPECIALIZED_HIP_REDUCE_MIN(std::int32_t) CAFFE2_SPECIALIZED_HIP_REDUCE_MIN(std::int64_t) @@ -2943,26 +1895,13 @@ CAFFE2_SPECIALIZED_HIP_REDUCE_MIN(float) CAFFE2_SPECIALIZED_HIP_REDUCE_MIN(double) #undef CAFFE2_SPECIALIZED_HIP_REDUCE_MIN -#define CAFFE2_SPECIALIZED_HIP_REDUCE_MAX(T) \ - template <> \ - void ReduceMax( \ - const int num_dims, \ - const int* dims, \ - const int num_axes, \ - const int* axes, \ - const T* X, \ - T* Y, \ - HIPContext* context) { \ - ReduceTensorHIP( \ - num_dims, \ - dims, \ - num_axes, \ - axes, \ - cub::Max(), \ - std::numeric_limits::lowest(), \ - X, \ - Y, \ - context); \ +#define CAFFE2_SPECIALIZED_HIP_REDUCE_MAX(T) \ + template <> \ + void ReduceMax(const int num_dims, const int *dims, \ + const int num_axes, const int *axes, \ + const T *X, T *Y, HIPContext *context) { \ + ReduceTensorHIP(num_dims, dims, num_axes, axes, cub::Max(), \ + std::numeric_limits::lowest(), X, Y, context); \ } CAFFE2_SPECIALIZED_HIP_REDUCE_MAX(std::int32_t) CAFFE2_SPECIALIZED_HIP_REDUCE_MAX(std::int64_t) @@ -2970,18 +1909,13 @@ CAFFE2_SPECIALIZED_HIP_REDUCE_MAX(float) CAFFE2_SPECIALIZED_HIP_REDUCE_MAX(double) #undef CAFFE2_SPECIALIZED_HIP_REDUCE_MAX -#define CAFFE2_SPECIALIZED_HIP_REDUCE_SUM(T) \ - template <> \ - void ReduceSum( \ - const int num_dims, \ - const int* dims, \ - const int num_axes, \ - const int* axes, \ - const T* X, \ - T* Y, \ - HIPContext* context) { \ - ReduceTensorHIP( \ - num_dims, dims, num_axes, axes, 
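ReduceTensorHIP above now guards the empty cases (launching nothing when inner_size is zero and filling Y with the reduction identity when only the reduced extent is empty), and ReduceMeanHIPImpl builds the mean as a cub::Sum reduction followed by a rescale with the product of the reduced dimensions. A host-side sketch of that final rescale step:

#include <vector>

// Turn a ReduceSum result into a mean: divide by the number of elements folded
// into each output, i.e. the product of the reduced dimensions.
void scale_sum_to_mean(std::vector<float>& Y, const std::vector<int>& dims,
                       const std::vector<int>& axes) {
  int count = 1;
  for (int a : axes) count *= dims[a];
  const float inv = 1.f / static_cast<float>(count);
  for (float& v : Y) v *= inv;  // the HIP code does this with math::Scale
}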
cub::Sum(), T(0), X, Y, context); \ +#define CAFFE2_SPECIALIZED_HIP_REDUCE_SUM(T) \ + template <> \ + void ReduceSum(const int num_dims, const int *dims, \ + const int num_axes, const int *axes, \ + const T *X, T *Y, HIPContext *context) { \ + ReduceTensorHIP(num_dims, dims, num_axes, axes, cub::Sum(), T(0), X, Y, \ + context); \ } CAFFE2_SPECIALIZED_HIP_REDUCE_SUM(std::int32_t) CAFFE2_SPECIALIZED_HIP_REDUCE_SUM(std::int64_t) @@ -2989,17 +1923,12 @@ CAFFE2_SPECIALIZED_HIP_REDUCE_SUM(float) CAFFE2_SPECIALIZED_HIP_REDUCE_SUM(double) #undef CAFFE2_SPECIALIZED_HIP_REDUCE_SUM -#define CAFFE2_SPECIALIZED_HIP_REDUCE_MEAN(T) \ - template <> \ - void ReduceMean( \ - const int num_dims, \ - const int* dims, \ - const int num_axes, \ - const int* axes, \ - const T* X, \ - T* Y, \ - HIPContext* context) { \ - ReduceMeanHIPImpl(num_dims, dims, num_axes, axes, X, Y, context); \ +#define CAFFE2_SPECIALIZED_HIP_REDUCE_MEAN(T) \ + template <> \ + void ReduceMean(const int num_dims, const int *dims, \ + const int num_axes, const int *axes, \ + const T *X, T *Y, HIPContext *context) { \ + ReduceMeanHIPImpl(num_dims, dims, num_axes, axes, X, Y, context); \ } CAFFE2_SPECIALIZED_HIP_REDUCE_MEAN(float) #undef CAFFE2_SPECIALIZED_HIP_REDUCE_MEAN @@ -3007,20 +1936,16 @@ CAFFE2_SPECIALIZED_HIP_REDUCE_MEAN(float) namespace { template -__global__ void BroadcastHIPKernel( - const int Y_size, - const SimpleArray X_strides, - const SimpleArray Y_dims, - const T* X, - T* Y) { +__global__ void +BroadcastHIPKernel(const int Y_size, const SimpleArray X_strides, + const SimpleArray Y_dims, const T *X, T *Y) { HIP_1D_KERNEL_LOOP(Y_index, Y_size) { int X_index = 0; int Y_index_val = Y_index; #pragma unroll for (int i = D - 1; i >= 0; --i) { - X_index += X_strides.data[i] == 0 - ? 0 - : (Y_index_val % Y_dims.data[i]) * X_strides.data[i]; + X_index += X_strides.data[i] == 0 ? 
0 : (Y_index_val % Y_dims.data[i]) * + X_strides.data[i]; Y_index_val /= Y_dims.data[i]; } Y[Y_index] = __ldg(X + X_index); @@ -3028,13 +1953,8 @@ __global__ void BroadcastHIPKernel( } template -void BroadcastHIPImpl( - const int X_ndim, - const int* X_dims, - const int* Y_dims, - const T* X, - T* Y, - HIPContext* context) { +void BroadcastHIPImpl(const int X_ndim, const int *X_dims, const int *Y_dims, + const T *X, T *Y, HIPContext *context) { SimpleArray X_strides_array; SimpleArray Y_dims_array; const int d = D - X_ndim; @@ -3048,34 +1968,21 @@ void BroadcastHIPImpl( std::copy_n(Y_dims, D, Y_dims_array.data); const int Y_size = std::accumulate(Y_dims, Y_dims + D, 1, std::multiplies()); - hipLaunchKernelGGL( - (BroadcastHIPKernel), - dim3(CAFFE_GET_BLOCKS(Y_size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - Y_size, - X_strides_array, - Y_dims_array, - X, - Y); + hipLaunchKernelGGL((BroadcastHIPKernel), dim3(CAFFE_GET_BLOCKS(Y_size)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + Y_size, X_strides_array, Y_dims_array, X, Y); } } // namespace -#define CAFFE2_SPECIALIZED_HIP_BROADCAST(T) \ - template <> \ - void Broadcast( \ - const int X_ndim, \ - const int* X_dims, \ - const int Y_ndim, \ - const int* Y_dims, \ - const T* X, \ - T* Y, \ - HIPContext* context) { \ - CAFFE_ENFORCE_LE(X_ndim, Y_ndim); \ - DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( \ - Y_ndim, BroadcastHIPImpl, T, X_ndim, X_dims, Y_dims, X, Y, context); \ +#define CAFFE2_SPECIALIZED_HIP_BROADCAST(T) \ + template <> \ + void Broadcast(const int X_ndim, const int *X_dims, \ + const int Y_ndim, const int *Y_dims, \ + const T *X, T *Y, HIPContext *context) { \ + CAFFE_ENFORCE_LE(X_ndim, Y_ndim); \ + DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( \ + Y_ndim, BroadcastHIPImpl, T, X_ndim, X_dims, Y_dims, X, Y, context); \ } CAFFE2_SPECIALIZED_HIP_BROADCAST(std::int32_t) CAFFE2_SPECIALIZED_HIP_BROADCAST(std::int64_t) @@ -3086,12 +1993,8 @@ CAFFE2_SPECIALIZED_HIP_BROADCAST(double) namespace { template -__global__ void RowwiseMomentsHIPKernel( - const int rows, - const int cols, - const T* X, - T* mean, - T* variance) { +__global__ void RowwiseMomentsHIPKernel(const int rows, const int cols, + const T *X, T *mean, T *variance) { __shared__ typename BlockReduce::TempStorage m_storage; __shared__ typename BlockReduce::TempStorage v_storage; for (int i = blockIdx.x; i < rows; i += gridDim.x) { @@ -3113,14 +2016,10 @@ __global__ void RowwiseMomentsHIPKernel( } template -__global__ void MomentsHIPKernel( - const int outer_size, - const int inner_size, - SimpleArray X_strides, - SimpleArray Y_dims, - const T* X, - T* mean, - T* variance) { +__global__ void MomentsHIPKernel(const int outer_size, const int inner_size, + SimpleArray X_strides, + SimpleArray, D> Y_dims, + const T *X, T *mean, T *variance) { __shared__ typename BlockReduce::TempStorage m_storage; __shared__ typename BlockReduce::TempStorage v_storage; for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { @@ -3130,9 +2029,10 @@ __global__ void MomentsHIPKernel( int X_index = 0; int Y_index = i * inner_size + j; #pragma unroll - for (int i = D - 1; i >= 0; --i) { - X_index += (Y_index % Y_dims.data[i]) * X_strides.data[i]; - Y_index /= Y_dims.data[i]; + for (int d = D - 1; d >= 0; --d) { + int r; + Y_dims.data[d].DivMod(Y_index, &Y_index, &r); + X_index += r * X_strides.data[d]; } m_val += __ldg(X + X_index); v_val += __ldg(X + X_index) * __ldg(X + X_index); @@ -3148,50 +2048,30 @@ __global__ void MomentsHIPKernel( } template -void MomentsHIPImpl( - 
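BroadcastHIPKernel above gives X one stride per output dimension and sets the stride to zero wherever X is broadcast, so every output index resolves to a valid input element without any branching on shapes. A host-side sketch of the same zero-stride mapping:

#include <vector>

// X_strides has one entry per output dimension and is 0 wherever X is
// broadcast along that dimension (including dimensions X does not have).
void broadcast(const std::vector<int>& Y_dims, const std::vector<int>& X_strides,
               const float* X, float* Y) {
  int Y_size = 1;
  for (int d : Y_dims) Y_size *= d;
  for (int Y_index = 0; Y_index < Y_size; ++Y_index) {
    int rest = Y_index;
    int X_index = 0;
    for (int i = static_cast<int>(Y_dims.size()) - 1; i >= 0; --i) {
      if (X_strides[i] != 0) X_index += (rest % Y_dims[i]) * X_strides[i];
      rest /= Y_dims[i];
    }
    Y[Y_index] = X[X_index];
  }
}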
const int outer_size, - const int inner_size, - const int* dims, - const int* axes, - const T* X, - T* mean, - T* variance, - HIPContext* context) { +void MomentsHIPImpl(const int outer_size, const int inner_size, const int *dims, + const int *axes, const T *X, T *mean, T *variance, + HIPContext *context) { SimpleArray X_strides; - SimpleArray Y_dims; + SimpleArray, D> Y_dims; utils::ComputeTransposedStrides(D, dims, axes, X_strides.data); for (int i = 0; i < D; ++i) { - Y_dims.data[i] = dims[axes[i]]; + Y_dims.data[i] = FixedDivisor(dims[axes[i]]); } - hipLaunchKernelGGL( - (MomentsHIPKernel), - dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - outer_size, - inner_size, - X_strides, - Y_dims, - X, - mean, - variance); + hipLaunchKernelGGL((MomentsHIPKernel), + dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + outer_size, inner_size, X_strides, Y_dims, X, mean, + variance); } template -void MomentsHIP( - const int num_dims, - const int* dims, - const int num_axes, - const int* axes, - const T* X, - T* mean, - T* variance, - HIPContext* context) { +void MomentsHIP(const int num_dims, const int *dims, const int num_axes, + const int *axes, const T *X, T *mean, T *variance, + HIPContext *context) { CAFFE_ENFORCE_LE(num_axes, num_dims); std::vector transpose_axes(num_dims); - utils::ComputeTransposeAxesForReduceOp( - num_dims, num_axes, axes, transpose_axes.data()); + utils::ComputeTransposeAxesForReduceOp(num_dims, num_axes, axes, + transpose_axes.data()); const int pivot = num_dims - num_axes; int outer_size = 1; for (int i = 0; i < pivot; ++i) { @@ -3201,47 +2081,27 @@ void MomentsHIP( for (int i = pivot; i < num_dims; ++i) { inner_size *= dims[transpose_axes[i]]; } - if (transpose_axes[pivot] == pivot) { - hipLaunchKernelGGL( - (RowwiseMomentsHIPKernel), - dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - outer_size, - inner_size, - X, - mean, - variance); - return; + if (outer_size > 0 && inner_size > 0) { + if (transpose_axes[pivot] == pivot) { + hipLaunchKernelGGL((RowwiseMomentsHIPKernel), + dim3(std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + outer_size, inner_size, X, mean, variance); + return; + } + DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( + num_dims, MomentsHIPImpl, T, outer_size, inner_size, dims, + transpose_axes.data(), X, mean, variance, context); } - DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( - num_dims, - MomentsHIPImpl, - T, - outer_size, - inner_size, - dims, - transpose_axes.data(), - X, - mean, - variance, - context); } } // namespace #define CAFFE2_SPECIALIZED_HIP_MOMENTS(T) \ template <> \ - void Moments( \ - const int num_dims, \ - const int* dims, \ - const int num_axes, \ - const int* axes, \ - const T* X, \ - T* mean, \ - T* variance, \ - HIPContext* context) { \ + void Moments(const int num_dims, const int *dims, \ + const int num_axes, const int *axes, const T *X, \ + T *mean, T *variance, HIPContext *context) { \ MomentsHIP(num_dims, dims, num_axes, axes, X, mean, variance, context); \ } CAFFE2_SPECIALIZED_HIP_MOMENTS(float) @@ -3250,65 +2110,54 @@ CAFFE2_SPECIALIZED_HIP_MOMENTS(float) namespace { template -__global__ void TransposeHIPKernel( - const int size, - const SimpleArray X_strides, - const SimpleArray Y_dims, - const T* X, - T* Y) { +__global__ void +TransposeHIPKernel(const int size, const SimpleArray 
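The moments kernels above accumulate the sum and the sum of squares per output element (RowwiseMomentsHIPKernel for the contiguous case, MomentsHIPKernel with the FixedDivisor index decomposition otherwise) and derive mean = s1 / count and variance = s2 / count - mean^2. A host-side sketch of the rowwise case:

// Mean and (biased) variance per row of a row-major rows x cols matrix, from a
// single pass over the sum and the sum of squares.
void rowwise_moments(int rows, int cols, const float* X, float* mean,
                     float* variance) {
  for (int i = 0; i < rows; ++i) {
    float s1 = 0.f, s2 = 0.f;
    for (int j = 0; j < cols; ++j) {
      const float v = X[i * cols + j];
      s1 += v;
      s2 += v * v;
    }
    mean[i] = s1 / cols;
    variance[i] = s2 / cols - mean[i] * mean[i];
  }
}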
X_strides, + const SimpleArray, D> Y_dims, const T *X, + T *Y) { HIP_1D_KERNEL_LOOP(Y_index, size) { int X_index = 0; int Y_index_val = Y_index; #pragma unroll for (int i = D - 1; i >= 0; --i) { - X_index += (Y_index_val % Y_dims.data[i]) * X_strides.data[i]; - Y_index_val /= Y_dims.data[i]; + int d; + Y_dims.data[i].DivMod(Y_index_val, &Y_index_val, &d); + X_index += d * X_strides.data[i]; } Y[Y_index] = __ldg(X + X_index); } } template -void TransposeHIPImpl( - const int* dims, - const int* axes, - const T* X, - T* Y, - HIPContext* context) { +void TransposeHIPImpl(const int *dims, const int *axes, const T *X, T *Y, + HIPContext *context) { SimpleArray X_strides; - SimpleArray Y_dims; + SimpleArray, D> Y_dims; utils::ComputeTransposedStrides(D, dims, axes, X_strides.data); int size = 1; for (int i = 0; i < D; ++i) { - Y_dims.data[i] = dims[axes[i]]; + Y_dims.data[i] = FixedDivisor(dims[axes[i]]); size *= dims[i]; } - hipLaunchKernelGGL( - (TransposeHIPKernel), - dim3(CAFFE_GET_BLOCKS(size)), - dim3(CAFFE_HIP_NUM_THREADS), - 0, - context->hip_stream(), - size, - X_strides, - Y_dims, - X, - Y); + hipLaunchKernelGGL((TransposeHIPKernel), dim3(CAFFE_GET_BLOCKS(size)), + dim3(CAFFE_HIP_NUM_THREADS), 0, context->hip_stream(), + size, X_strides, Y_dims, X, Y); } } // namespace -#define CAFFE2_SPECIALIZED_HIP_TRANSPOSE(T) \ - template <> \ - void Transpose( \ - const int ndim, \ - const int* dims, \ - const int* axes, \ - const T* X, \ - T* Y, \ - HIPContext* context) { \ - DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( \ - ndim, TransposeHIPImpl, T, dims, axes, X, Y, context); \ +#define CAFFE2_SPECIALIZED_HIP_TRANSPOSE(T) \ + template <> \ + void Transpose(const int ndim, const int *dims, \ + const int *axes, const T *X, T *Y, \ + HIPContext *context) { \ + if (utils::IsIdentityPermutation(ndim, axes)) { \ + const int size = \ + std::accumulate(dims, dims + ndim, 1, std::multiplies()); \ + context->template Copy(size, X, Y); \ + return; \ + } \ + DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(ndim, TransposeHIPImpl, T, dims, \ + axes, X, Y, context); \ } CAFFE2_SPECIALIZED_HIP_TRANSPOSE(float) CAFFE2_SPECIALIZED_HIP_TRANSPOSE(double) diff --git a/caffe2/utils/math.h b/caffe2/utils/math.h index 88d8db509847e1..ca8535e4aa3dd3 100644 --- a/caffe2/utils/math.h +++ b/caffe2/utils/math.h @@ -383,6 +383,17 @@ void Set(const size_t N, const T alpha, T* X, Context* context); template void RandUniform(const size_t n, const T a, const T b, T* r, Context* context); +// Generate n values that sum up to a fixed sum +// and subject to a restriction a <= x <= b for each x generated +template +void RandFixedSum( + const size_t n, + const T a, + const T b, + const T sum, + T* r, + Context* context); + template void RandUniformUnique( const size_t n, diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index 72c22853b2dfdb..6ebf41ab7cba12 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -1714,31 +1714,101 @@ DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) #undef DELEGATE_BROADCAST_BINARY_FUNCTION -template <> -void RandUniform( - const size_t n, - const float a, - const float b, - float* r, - CPUContext* context) { - std::uniform_real_distribution distribution(a, b); - for (size_t i = 0; i < n; ++i) { - r[i] = distribution(context->RandGenerator()); - } -} - -template <> -void RandUniform( - const size_t n, - const int a, - const int b, - int* r, - CPUContext* context) { - std::uniform_int_distribution distribution(a, b); - for (size_t i = 0; i < n; ++i) { - r[i] = 
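The Transpose specialization above adds an identity-permutation fast path that degenerates to a flat copy, and otherwise decomposes each output index along the permuted dimensions (FixedDivisor::DivMod on the device) and maps it through the transposed strides of X. A host-side sketch with ordinary division, where the stride computation stands in for utils::ComputeTransposedStrides:

#include <cstring>
#include <vector>

// Y[j0, ..., j_{D-1}] = X[permuted index].
void transpose(int ndim, const std::vector<int>& dims, const std::vector<int>& axes,
               const float* X, float* Y) {
  bool identity = true;
  for (int i = 0; i < ndim; ++i) identity = identity && (axes[i] == i);

  int size = 1;
  for (int i = 0; i < ndim; ++i) size *= dims[i];
  if (identity) {                               // fast path added by this change:
    std::memcpy(Y, X, sizeof(float) * size);    // identity permutation is a copy
    return;
  }

  std::vector<int> strides(ndim, 1), X_strides(ndim), Y_dims(ndim);
  for (int i = ndim - 2; i >= 0; --i) strides[i] = strides[i + 1] * dims[i + 1];
  for (int i = 0; i < ndim; ++i) {
    X_strides[i] = strides[axes[i]];  // stride of X along the permuted dimension
    Y_dims[i] = dims[axes[i]];
  }
  for (int Y_index = 0; Y_index < size; ++Y_index) {
    int rest = Y_index, X_index = 0;
    for (int i = ndim - 1; i >= 0; --i) {
      X_index += (rest % Y_dims[i]) * X_strides[i];  // FixedDivisor::DivMod on device
      rest /= Y_dims[i];
    }
    Y[Y_index] = X[X_index];
  }
}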
distribution(context->RandGenerator()); - } -} +#define CAFFE2_RAND_UNIFORM_REAL(T) \ + template <> \ + void RandUniform( \ + const size_t n, const T a, const T b, T* r, CPUContext* context) { \ + std::uniform_real_distribution distribution(a, b); \ + for (size_t i = 0; i < n; ++i) { \ + r[i] = distribution(context->RandGenerator()); \ + } \ + } +CAFFE2_RAND_UNIFORM_REAL(float); +CAFFE2_RAND_UNIFORM_REAL(double); +#undef CAFFE2_RAND_UNIFORM_REAL + +#define CAFFE2_RAND_UNIFORM_CHAR(T) \ + template <> \ + void RandUniform( \ + const size_t n, const T a, const T b, T* r, CPUContext* context) { \ + std::uniform_int_distribution distribution((short)a, (short)b); \ + for (size_t i = 0; i < n; ++i) { \ + r[i] = static_cast(distribution(context->RandGenerator())); \ + } \ + } +CAFFE2_RAND_UNIFORM_CHAR(int8_t); +CAFFE2_RAND_UNIFORM_CHAR(uint8_t); +#undef CAFFE2_RAND_UNIFORM_CHAR + +#define CAFFE2_RAND_UNIFORM_INT(T) \ + template <> \ + void RandUniform( \ + const size_t n, const T a, const T b, T* r, CPUContext* context) { \ + std::uniform_int_distribution distribution(a, b); \ + for (size_t i = 0; i < n; ++i) { \ + r[i] = distribution(context->RandGenerator()); \ + } \ + } + +CAFFE2_RAND_UNIFORM_INT(int16_t); +CAFFE2_RAND_UNIFORM_INT(int32_t); +CAFFE2_RAND_UNIFORM_INT(int64_t); +CAFFE2_RAND_UNIFORM_INT(uint16_t); +CAFFE2_RAND_UNIFORM_INT(uint32_t); +CAFFE2_RAND_UNIFORM_INT(uint64_t); +#undef CAFFE2_RAND_UNIFORM_INT + +// This is not uniformly distributed between a and b. +// It takes advantage of normal distribution to generate numbers +// with mean = sum / n. +// Ideally the algorithm should be generating n numbers between 0 and 1, +// sum them up as scaled_sum, and use sum / scaled_sum to adjust the values +// to between a and b. +// The algorithm is non-trivial given the adjustment would be different towards +// each value. +#define CAFFE2_RAND_FIXED_SUM(T) \ + template <> \ + void RandFixedSum( \ + const size_t n, \ + const T a, \ + const T b, \ + const T sum, \ + T* r, \ + CPUContext* context) { \ + CAFFE_ENFORCE_GE(a, 0); \ + CAFFE_ENFORCE_GE(sum / (double)n, a); \ + CAFFE_ENFORCE_LE(sum / (double)n, b); \ + T current_sum = 0; \ + for (size_t i = 0; i < n - 1; ++i) { \ + auto remaining_numbers = n - 1 - i; \ + double mean = (sum - current_sum) / remaining_numbers; \ + double stdev = std::min(mean - a, b - mean); \ + std::normal_distribution distribution{mean, stdev / 4.0}; \ + T value = distribution(context->RandGenerator()); \ + auto remaining_sum = sum - current_sum - value; \ + if (value < a || remaining_sum > b * remaining_numbers) { \ + value = a; \ + } else if (value > b || remaining_sum < a * remaining_numbers) { \ + value = b; \ + } \ + r[i] = value; \ + CAFFE_ENFORCE(a <= value && value <= b); \ + current_sum += value; \ + } \ + r[n - 1] = sum - current_sum; \ + CAFFE_ENFORCE(a <= r[n - 1] && r[n - 1] <= b); \ + } +CAFFE2_RAND_FIXED_SUM(float); +CAFFE2_RAND_FIXED_SUM(double); +CAFFE2_RAND_FIXED_SUM(int8_t); +CAFFE2_RAND_FIXED_SUM(int16_t); +CAFFE2_RAND_FIXED_SUM(int32_t); +CAFFE2_RAND_FIXED_SUM(int64_t); +CAFFE2_RAND_FIXED_SUM(uint8_t); +CAFFE2_RAND_FIXED_SUM(uint16_t); +CAFFE2_RAND_FIXED_SUM(uint32_t); +CAFFE2_RAND_FIXED_SUM(uint64_t); +#undef CAFFE2_RAND_FIXED_SUM #define CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(T) \ template <> \ diff --git a/caffe2/utils/mixed_utils_hip.h b/caffe2/utils/mixed_utils_hip.h new file mode 100644 index 00000000000000..f8e07dca2cd134 --- /dev/null +++ b/caffe2/utils/mixed_utils_hip.h @@ -0,0 +1,116 @@ +// Copyright 2004-present Facebook. 
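RandFixedSum above generates n values in [a, b] whose total is exactly `sum`: each value is drawn from a normal distribution centred on the mean still required to hit the target, clamped so the remainder stays achievable, and the last element absorbs the exact residual, so the result is not uniformly distributed. A simplified host-side sketch of that scheme, assuming a <= sum / n <= b as the macro enforces:

#include <algorithm>
#include <cstddef>
#include <random>
#include <vector>

// Draw each value around the mean still needed to reach `sum`, keep it inside
// [a, b], and make sure the remaining budget stays representable; the last
// element is set to the exact residual.
std::vector<double> rand_fixed_sum(std::size_t n, double a, double b,
                                   double sum, std::mt19937& gen) {
  std::vector<double> r(n);
  if (n == 0) return r;
  double current = 0.0;
  for (std::size_t i = 0; i + 1 < n; ++i) {
    const double remaining = static_cast<double>(n - 1 - i);  // values after this one
    const double mean = (sum - current) / (remaining + 1.0);
    const double sigma = std::max(std::min(mean - a, b - mean) / 4.0, 1e-12);
    std::normal_distribution<double> dist(mean, sigma);
    double v = std::min(b, std::max(a, dist(gen)));
    v = std::min(v, sum - current - a * remaining);  // leave at least a per later value
    v = std::max(v, sum - current - b * remaining);  // leave at most b per later value
    r[i] = v;
    current += v;
  }
  r[n - 1] = sum - current;  // exact by construction, and in [a, b] given a <= sum/n <= b
  return r;
}

The macro additionally re-checks a <= value <= b with CAFFE_ENFORCE after every draw; the sketch relies on the clamps instead.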
All Rights Reserved. +#ifndef CAFFE2_UTILS_MIXED_UTILS_HIP_H +#define CAFFE2_UTILS_MIXED_UTILS_HIP_H + +#include "caffe2/core/hip/common_hip.h" +#include "caffe2/core/hip/context_hip.h" + +// define functions to allow add/mult/store operaions for input/output with +// mixed precisions. +namespace caffe2 { + +// functions that will only be triggered when there is no spcialized version +// supported +template +inline __device__ T mixed_mult(T data1, T2 data2) +{ + return data1 * data2; +}; + +template +inline __device__ T mixed_add(T data1, T2 data2) +{ + return data1 + data2; +}; + +template +inline __device__ void mixed_store(TIN* data_in, TOUT* data_out) +{ + *data_out = *data_in; + return; +}; + +template +inline __device__ void mixed_store(T* data_in, T* data_out) +{ + *data_out = *data_in; + return; +}; + +template <> +inline __device__ float mixed_mult(float data1, const float data2) +{ + return data1 * data2; +} + +template <> +inline __device__ float mixed_mult(float data1, const half data2) +{ + return data1 * __half2float(data2); +} + +template <> +inline __device__ float mixed_mult(float data1, float16 data2) +{ + half* data2_half = reinterpret_cast(&data2); + return data1 * __half2float(*data2_half); +} +template <> +inline __device__ float mixed_add(float data1, const float data2) +{ + return data1 + data2; +} + +template <> +inline __device__ float mixed_add(float data1, const half data2) +{ + return data1 + __half2float(data2); +} + +template <> +inline __device__ float mixed_add(float data1, float16 data2) +{ + half* data2_half = reinterpret_cast(&data2); + return data1 + __half2float(*data2_half); +} + +template <> +inline __device__ void mixed_store(float* data_in, float* data_out) +{ + *data_out = *data_in; + return; +} + +template <> +inline __device__ void mixed_store(half* data_in, float* data_out) +{ + *data_out = __half2float(*data_in); + return; +} + +template <> +inline __device__ void mixed_store(float16* data_in, float* data_out) +{ + half* data_in_half = reinterpret_cast(data_in); + *data_out = __half2float(*data_in_half); + return; +} + +template <> +inline __device__ void mixed_store(float* data_in, float16* data_out) +{ + half data_in_half = __float2half(*data_in); + float16* data_in_float16 = reinterpret_cast(&data_in_half); + *data_out = *data_in_float16; + return; +} + +template <> +inline __device__ void mixed_store(float* data_in, half* data_out) +{ + half data_in_half = __float2half(*data_in); + *data_out = data_in_half; + return; +} +} // namespace caffe2 +#endif // for CAFFE2_UTILS_MIXED_UTILS_HIP_H diff --git a/cmake/public/utils.cmake b/cmake/public/utils.cmake index 84ed2c4c6edf7d..a18fb2911ab93b 100644 --- a/cmake/public/utils.cmake +++ b/cmake/public/utils.cmake @@ -176,6 +176,7 @@ function(aten_compile_options libname) -Wextra -fexceptions -Wno-missing-field-initializers + -Wno-strict-overflow -Wno-type-limits -Wno-unused-parameter -Wno-unknown-warning-option diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 538d2603962b54..987044bbd212f4 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -1102,6 +1102,11 @@ Linear functions .. autofunction:: linear +:hidden:`bilinear` +~~~~~~~~~~~~~~~~ + +.. autofunction:: bilinear + Dropout functions ----------------- diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index 86a862316dab92..89f96cf174ddbe 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -329,6 +329,7 @@ view of a storage and defines numeric operations on it. .. automethod:: repeat .. 
automethod:: requires_grad_ .. automethod:: reshape + .. automethod:: reshape_as .. automethod:: resize_ .. automethod:: resize_as_ .. automethod:: round @@ -336,6 +337,7 @@ view of a storage and defines numeric operations on it. .. automethod:: rsqrt .. automethod:: rsqrt_ .. automethod:: scatter_ + .. automethod:: scatter_add_ .. automethod:: select .. automethod:: set_ .. automethod:: share_memory_ diff --git a/docs/source/torch.rst b/docs/source/torch.rst index b8d6e2a349fe8d..75b71fd314bb56 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -223,6 +223,7 @@ Comparison Ops .. autofunction:: equal .. autofunction:: ge .. autofunction:: gt +.. autofunction:: isfinite .. autofunction:: isinf .. autofunction:: isnan .. autofunction:: kthvalue @@ -250,6 +251,7 @@ Spectral Ops Other Operations ~~~~~~~~~~~~~~~~~~~~~~ +.. autofunction:: bincount .. autofunction:: cross .. autofunction:: diag .. autofunction:: diagflat @@ -257,6 +259,7 @@ Other Operations .. autofunction:: einsum .. autofunction:: flip .. autofunction:: histc +.. autofunction:: meshgrid .. autofunction:: renorm .. autofunction:: trace .. autofunction:: tril diff --git a/setup.py b/setup.py index 85f2f5bca63969..042d8668bb7b96 100644 --- a/setup.py +++ b/setup.py @@ -318,6 +318,8 @@ def build_libs(libs): if USE_CUDA: my_env["CUDA_BIN_PATH"] = CUDA_HOME build_libs_cmd += ['--use-cuda'] + if IS_WINDOWS: + my_env["NVTOOLEXT_HOME"] = NVTOOLEXT_HOME if USE_ROCM: build_libs_cmd += ['--use-rocm'] if USE_NNPACK: @@ -338,6 +340,8 @@ def build_libs(libs): if FULL_CAFFE2: build_libs_cmd += ['--full-caffe2'] + my_env["BUILD_TORCH"] = "ON" + if subprocess.call(build_libs_cmd + libs, env=my_env) != 0: print("Failed to run '{}'".format(' '.join(build_libs_cmd + libs))) sys.exit(1) @@ -640,6 +644,7 @@ def run(self): '-std=c++11', '-Wall', '-Wextra', + '-Wno-strict-overflow', '-Wno-unused-parameter', '-Wno-missing-field-initializers', '-Wno-write-strings', @@ -719,120 +724,73 @@ def run(self): main_compile_args = ['-D_THP_CORE', '-DONNX_NAMESPACE=' + ONNX_NAMESPACE] main_libraries = ['shm'] main_link_args = CAFFE2_LIBS + [NANOPB_STATIC_LIB, PROTOBUF_STATIC_LIB] +if IS_WINDOWS: + main_link_args.append(os.path.join(lib_path, 'torch.lib')) +elif IS_DARWIN: + main_link_args.append(os.path.join(lib_path, 'libtorch.dylib')) +else: + main_link_args.append(os.path.join(lib_path, 'libtorch.so')) main_sources = [ - "torch/csrc/PtrWrapper.cpp", - "torch/csrc/Module.cpp", - "torch/csrc/Generator.cpp", - "torch/csrc/Size.cpp", - "torch/csrc/Dtype.cpp", + "torch/csrc/DataLoader.cpp", "torch/csrc/Device.cpp", + "torch/csrc/Dtype.cpp", + "torch/csrc/DynamicTypes.cpp", "torch/csrc/Exceptions.cpp", + "torch/csrc/Generator.cpp", "torch/csrc/Layout.cpp", + "torch/csrc/Module.cpp", + "torch/csrc/PtrWrapper.cpp", + "torch/csrc/Size.cpp", "torch/csrc/Storage.cpp", - "torch/csrc/DataLoader.cpp", - "torch/csrc/DynamicTypes.cpp", - "torch/csrc/assertions.cpp", + "torch/csrc/autograd/functions/init.cpp", + "torch/csrc/autograd/generated/python_functions.cpp", + "torch/csrc/autograd/generated/python_nn_functions.cpp", + "torch/csrc/autograd/generated/python_torch_functions.cpp", + "torch/csrc/autograd/generated/python_variable_methods.cpp", + "torch/csrc/autograd/init.cpp", + "torch/csrc/autograd/python_anomaly_mode.cpp", + "torch/csrc/autograd/python_cpp_function.cpp", + "torch/csrc/autograd/python_engine.cpp", + "torch/csrc/autograd/python_function.cpp", + "torch/csrc/autograd/python_hook.cpp", + "torch/csrc/autograd/python_legacy_variable.cpp", + 
"torch/csrc/autograd/python_variable.cpp", + "torch/csrc/autograd/python_variable_indexing.cpp", "torch/csrc/byte_order.cpp", - "torch/csrc/torch.cpp", - "torch/csrc/utils.cpp", - "torch/csrc/utils/cuda_lazy_init.cpp", - "torch/csrc/utils/invalid_arguments.cpp", - "torch/csrc/utils/object_ptr.cpp", - "torch/csrc/utils/python_arg_parser.cpp", - "torch/csrc/utils/tensor_list.cpp", - "torch/csrc/utils/tensor_new.cpp", - "torch/csrc/utils/tensor_numpy.cpp", - "torch/csrc/utils/tensor_dtypes.cpp", - "torch/csrc/utils/tensor_layouts.cpp", - "torch/csrc/utils/tensor_types.cpp", - "torch/csrc/utils/tuple_parser.cpp", - "torch/csrc/utils/tensor_apply.cpp", - "torch/csrc/utils/tensor_conversion_dispatch.cpp", - "torch/csrc/utils/tensor_flatten.cpp", - "torch/csrc/utils/variadic.cpp", - "torch/csrc/serialization.cpp", "torch/csrc/finalizer.cpp", + "torch/csrc/jit/batched/BatchTensor.cpp", "torch/csrc/jit/init.cpp", - "torch/csrc/jit/interpreter.cpp", - "torch/csrc/jit/register_prim_ops.cpp", + "torch/csrc/jit/passes/onnx.cpp", + "torch/csrc/jit/passes/onnx/fixup_onnx_loop.cpp", + "torch/csrc/jit/passes/onnx/peephole.cpp", + "torch/csrc/jit/passes/to_batch.cpp", + "torch/csrc/jit/python_arg_flatten.cpp", "torch/csrc/jit/python_interpreter.cpp", - "torch/csrc/jit/ir.cpp", - "torch/csrc/jit/fusion_compiler.cpp", - "torch/csrc/jit/graph_executor.cpp", "torch/csrc/jit/python_ir.cpp", - "torch/csrc/jit/test_jit.cpp", - "torch/csrc/jit/tracer.cpp", - "torch/csrc/jit/tracer_state.cpp", "torch/csrc/jit/python_tracer.cpp", - "torch/csrc/jit/passes/shape_analysis.cpp", - "torch/csrc/jit/interned_strings.cpp", - "torch/csrc/jit/type.cpp", - "torch/csrc/jit/export.cpp", - "torch/csrc/jit/import.cpp", - "torch/csrc/jit/autodiff.cpp", - "torch/csrc/jit/python_arg_flatten.cpp", - "torch/csrc/jit/variable_flags.cpp", - "torch/csrc/jit/passes/create_autodiff_subgraphs.cpp", - "torch/csrc/jit/passes/graph_fuser.cpp", - "torch/csrc/jit/passes/onnx.cpp", - "torch/csrc/jit/passes/dead_code_elimination.cpp", - "torch/csrc/jit/passes/remove_expands.cpp", - "torch/csrc/jit/passes/lower_tuples.cpp", - "torch/csrc/jit/passes/lower_grad_of.cpp", - "torch/csrc/jit/passes/common_subexpression_elimination.cpp", - "torch/csrc/jit/passes/peephole.cpp", - "torch/csrc/jit/passes/inplace_check.cpp", - "torch/csrc/jit/passes/canonicalize.cpp", - "torch/csrc/jit/passes/batch_mm.cpp", - "torch/csrc/jit/passes/decompose_addmm.cpp", - "torch/csrc/jit/passes/specialize_undef.cpp", - "torch/csrc/jit/passes/erase_number_types.cpp", - "torch/csrc/jit/passes/loop_unrolling.cpp", - "torch/csrc/jit/passes/to_batch.cpp", - "torch/csrc/jit/passes/onnx/peephole.cpp", - "torch/csrc/jit/passes/onnx/fixup_onnx_loop.cpp", - "torch/csrc/jit/generated/register_aten_ops.cpp", - "torch/csrc/jit/operator.cpp", + "torch/csrc/jit/script/init.cpp", "torch/csrc/jit/script/lexer.cpp", - "torch/csrc/jit/script/compiler.cpp", "torch/csrc/jit/script/module.cpp", - "torch/csrc/jit/script/init.cpp", "torch/csrc/jit/script/python_tree_views.cpp", - "torch/csrc/jit/batched/BatchTensor.cpp", - "torch/csrc/autograd/init.cpp", - "torch/csrc/autograd/aten_variable_hooks.cpp", - "torch/csrc/autograd/grad_mode.cpp", - "torch/csrc/autograd/anomaly_mode.cpp", - "torch/csrc/autograd/python_anomaly_mode.cpp", - "torch/csrc/autograd/engine.cpp", - "torch/csrc/autograd/function.cpp", - "torch/csrc/autograd/variable.cpp", - "torch/csrc/autograd/saved_variable.cpp", - "torch/csrc/autograd/input_buffer.cpp", - "torch/csrc/autograd/profiler.cpp", - 
"torch/csrc/autograd/python_function.cpp", - "torch/csrc/autograd/python_cpp_function.cpp", - "torch/csrc/autograd/python_variable.cpp", - "torch/csrc/autograd/python_variable_indexing.cpp", - "torch/csrc/autograd/python_legacy_variable.cpp", - "torch/csrc/autograd/python_engine.cpp", - "torch/csrc/autograd/python_hook.cpp", - "torch/csrc/autograd/generated/VariableType.cpp", - "torch/csrc/autograd/generated/Functions.cpp", - "torch/csrc/autograd/generated/python_torch_functions.cpp", - "torch/csrc/autograd/generated/python_variable_methods.cpp", - "torch/csrc/autograd/generated/python_functions.cpp", - "torch/csrc/autograd/generated/python_nn_functions.cpp", - "torch/csrc/autograd/functions/basic_ops.cpp", - "torch/csrc/autograd/functions/tensor.cpp", - "torch/csrc/autograd/functions/accumulate_grad.cpp", - "torch/csrc/autograd/functions/utils.cpp", - "torch/csrc/autograd/functions/init.cpp", "torch/csrc/nn/THNN.cpp", - "torch/csrc/tensor/python_tensor.cpp", - "torch/csrc/onnx/onnx.npb.cpp", - "torch/csrc/onnx/onnx.cpp", "torch/csrc/onnx/init.cpp", + "torch/csrc/serialization.cpp", + "torch/csrc/tensor/python_tensor.cpp", + "torch/csrc/utils.cpp", + "torch/csrc/utils/cuda_lazy_init.cpp", + "torch/csrc/utils/invalid_arguments.cpp", + "torch/csrc/utils/object_ptr.cpp", + "torch/csrc/utils/python_arg_parser.cpp", + "torch/csrc/utils/tensor_apply.cpp", + "torch/csrc/utils/tensor_conversion_dispatch.cpp", + "torch/csrc/utils/tensor_dtypes.cpp", + "torch/csrc/utils/tensor_flatten.cpp", + "torch/csrc/utils/tensor_layouts.cpp", + "torch/csrc/utils/tensor_list.cpp", + "torch/csrc/utils/tensor_new.cpp", + "torch/csrc/utils/tensor_numpy.cpp", + "torch/csrc/utils/tensor_types.cpp", + "torch/csrc/utils/tuple_parser.cpp", ] try: diff --git a/test/common.py b/test/common.py index 8ee3b2be855157..f3b60cf522f43c 100644 --- a/test/common.py +++ b/test/common.py @@ -28,7 +28,7 @@ import torch import torch.cuda from torch._utils_internal import get_writable_path -from torch._six import string_classes +from torch._six import string_classes, inf import torch.backends.cudnn import torch.backends.mkl @@ -56,25 +56,47 @@ def run_tests(argv=UNITTEST_ARGS): IS_WINDOWS = sys.platform == "win32" IS_PPC = platform.machine() == "ppc64le" -TEST_NUMPY = True -try: - import numpy -except ImportError: - TEST_NUMPY = False - -TEST_SCIPY = True -try: - import scipy -except ImportError: - TEST_SCIPY = False +def _check_module_exists(name): + r"""Returns if a top-level module with :attr:`name` exists *without** + importing it. This is generally safer than try-catch block around a + `import X`. It avoids third party libraries breaking assumptions of some of + our tests, e.g., setting multiprocessing start method when imported + (see librosa/#747, torchvision/#544). + """ + if not PY3: # Python 2 + import imp + try: + imp.find_module(name) + return True + except ImportError: + return False + elif PY34: # Python [3, 3.4) + import importlib + loader = importlib.find_loader(name) + return loader is not None + else: # Python >= 3.4 + import importlib + spec = importlib.util.find_spec(name) + return spec is not None + +TEST_NUMPY = _check_module_exists('numpy') +TEST_SCIPY = _check_module_exists('scipy') TEST_MKL = torch.backends.mkl.is_available() +# On Py2, importing librosa 0.6.1 triggers a TypeError (if using newest joblib) +# see librosa/librosa#729. 
+# TODO: allow Py2 when librosa 0.6.2 releases +TEST_LIBROSA = _check_module_exists('librosa') and PY3 + NO_MULTIPROCESSING_SPAWN = os.environ.get('NO_MULTIPROCESSING_SPAWN', '0') == '1' TEST_WITH_ASAN = os.getenv('PYTORCH_TEST_WITH_ASAN', '0') == '1' TEST_WITH_UBSAN = os.getenv('PYTORCH_TEST_WITH_UBSAN', '0') == '1' BUILT_WITH_ROCM = os.getenv('PYTORCH_BUILT_WITH_ROCM', '0') == '1' +if TEST_NUMPY: + import numpy + def skipIfNoLapack(fn): @wraps(fn) @@ -332,7 +354,7 @@ def assertTensorsEqual(a, b): elif isinstance(x, bool) and isinstance(y, bool): super(TestCase, self).assertEqual(x, y, message) elif isinstance(x, Number) and isinstance(y, Number): - if abs(x) == float('inf') or abs(y) == float('inf'): + if abs(x) == inf or abs(y) == inf: if allow_inf: super(TestCase, self).assertEqual(x, y, message) else: diff --git a/test/cpp/api/cursor.cpp b/test/cpp/api/cursor.cpp index 01a8cdb0c375a1..5c998661be2368 100644 --- a/test/cpp/api/cursor.cpp +++ b/test/cpp/api/cursor.cpp @@ -101,19 +101,36 @@ TEST_CASE("cursor/module") { SECTION("Map works") { std::vector vector(3); cursor.map(vector.begin(), [](Module& module) { return &module; }); + REQUIRE(vector[0] == &model[0]); + REQUIRE(vector[1] == &model[1]); + REQUIRE(vector[2] == &model[2]); std::list list; - cursor.map( - std::back_inserter(list), [](Module& module) { return &module; }); + cursor.map(std::inserter(list, list.end()), [](Module& module) { + return &module; + }); + REQUIRE(list.size() == 3); + auto iterator = list.begin(); + REQUIRE(*iterator++ == &model[0]); + REQUIRE(*iterator++ == &model[1]); + REQUIRE(*iterator++ == &model[2]); + REQUIRE(iterator == list.end()); } SECTION("Map_items works") { - std::map output; + std::map output; cursor.map_items( std::inserter(output, output.end()), [](const std::string& key, Module& module) { - return std::make_pair(key.c_str(), &module); + return std::make_pair(key, &module); }); + REQUIRE(output.size() == 3); + REQUIRE(output.count("0")); + REQUIRE(output.count("1")); + REQUIRE(output.count("2")); + REQUIRE(output["0"] == &model[0]); + REQUIRE(output["1"] == &model[1]); + REQUIRE(output["2"] == &model[2]); } SECTION("Count works for flat models") { @@ -280,29 +297,28 @@ TEST_CASE("cursor/parameter") { SECTION("Apply_items works") { size_t count = 0; - cursor.apply_items( - [&count, &model, &first, &second]( - const std::string& key, torch::Tensor& tensor) { - switch (count) { - case 0: { - REQUIRE(tensor.equal(first->tensor1)); - break; - } - case 1: { - REQUIRE(tensor.equal(first->tensor2)); - break; - } - case 2: { - REQUIRE(tensor.equal(second->tensor1)); - break; - } - case 3: { - REQUIRE(tensor.equal(second->tensor2)); - break; - } - } - count += 1; - }); + cursor.apply_items([&count, &model, &first, &second]( + const std::string& key, torch::Tensor& tensor) { + switch (count) { + case 0: { + REQUIRE(tensor.equal(first->tensor1)); + break; + } + case 1: { + REQUIRE(tensor.equal(first->tensor2)); + break; + } + case 2: { + REQUIRE(tensor.equal(second->tensor1)); + break; + } + case 3: { + REQUIRE(tensor.equal(second->tensor2)); + break; + } + } + count += 1; + }); REQUIRE(count == 4); } diff --git a/test/cpp/api/main.cpp b/test/cpp/api/main.cpp index 9dc554419809b9..4b1aaba64b2ef1 100644 --- a/test/cpp/api/main.cpp +++ b/test/cpp/api/main.cpp @@ -16,10 +16,16 @@ int main(int argc, char* argv[]) { return return_code; } + // ~ disables tags. if (!torch::cuda::is_available()) { - std::cerr << "CUDA not available. Disabling CUDA tests" << std::endl; - // ~ disables the [cuda] tag. 
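Aside: the test/common.py hunk above starts probing optional modules without importing them, and the test/cpp/api/main.cpp hunk here disables the [cuda] and [multi-cuda] Catch tags when the hardware is missing. The Python test suite expresses the same gating with unittest skips; the Python 3 sketch below is illustrative only (TEST_MULTIGPU mirrors the common_cuda.py convention, the helper and test class are not from this patch).

    import importlib.util
    import unittest

    import torch


    def module_exists(name):
        # True if `name` is importable; find_spec() consults the import
        # machinery without executing the module, so merely probing an
        # optional dependency cannot change global state.
        return importlib.util.find_spec(name) is not None


    TEST_NUMPY = module_exists("numpy")
    TEST_CUDA = torch.cuda.is_available()
    TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2


    class ExampleGatedTests(unittest.TestCase):
        @unittest.skipIf(not TEST_NUMPY, "numpy not found")
        def test_numpy_roundtrip(self):
            import numpy as np  # safe: the spec was found above
            t = torch.arange(6).reshape(2, 3)
            self.assertEqual(t.numpy().tolist(), np.arange(6).reshape(2, 3).tolist())

        @unittest.skipIf(not TEST_CUDA, "CUDA not available")
        def test_single_gpu(self):
            self.assertTrue(torch.ones(3, device="cuda").is_cuda)

        @unittest.skipIf(not TEST_MULTIGPU, "fewer than two CUDA devices")
        def test_second_gpu(self):
            self.assertEqual(torch.ones(3, device="cuda:1").device.index, 1)


    if __name__ == "__main__":
        unittest.main()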
+ std::cerr << "CUDA not available. Disabling [cuda] and [multi-cuda] tests" + << std::endl; session.configData().testsOrTags.emplace_back("~[cuda]"); + session.configData().testsOrTags.emplace_back("~[multi-cuda]"); + } else if (torch::cuda::device_count() < 2) { + std::cerr << "Only one CUDA device detected. Disabling [multi-cuda] tests" + << std::endl; + session.configData().testsOrTags.emplace_back("~[multi-cuda]"); } return session.run(); diff --git a/test/cpp/api/module.cpp b/test/cpp/api/module.cpp index c8e7bdc605660d..66b11c126df14f 100644 --- a/test/cpp/api/module.cpp +++ b/test/cpp/api/module.cpp @@ -120,7 +120,7 @@ TEST_CASE("module/as") { REQUIRE(unit.as() == &unit); } -TEST_CASE("module/conversions", "[cuda]") { +TEST_CASE("module/conversions", "[multi-cuda]") { torch::manual_seed(0); Linear module(128, 64); SECTION("starts as float on CPU") { @@ -350,3 +350,31 @@ TEST_CASE("module/buffers") { REQUIRE(buffers.contains("c")); } } + +TEST_CASE("module/default-constructor") { + struct AImpl : torch::nn::Module { + AImpl() : x_(123) {} + AImpl(int x) : x_(x) {} + int x_; + }; + TORCH_MODULE(A); + + { + A a; + REQUIRE(a); + REQUIRE(!a.is_empty()); + REQUIRE(a->x_ == 123); + } + { + A a(5); + REQUIRE(a); + REQUIRE(!a.is_empty()); + REQUIRE(a->x_ == 5); + } + { + A a = nullptr; + REQUIRE(!a); + REQUIRE(a.is_empty()); + REQUIRE_THROWS_WITH(a->x_, StartsWith("Accessing empty ModuleHolder")); + } +} diff --git a/test/cpp/api/optim.cpp b/test/cpp/api/optim.cpp index 02a9ca14a36705..186159c8e98edf 100644 --- a/test/cpp/api/optim.cpp +++ b/test/cpp/api/optim.cpp @@ -135,6 +135,30 @@ void check_exact_values( } } +TEST_CASE("Optim/BasicInterface") { + struct MyOptimizer : Optimizer { + using Optimizer::Optimizer; + void step() override {} + }; + std::vector parameters = { + torch::ones({2, 3}), torch::zeros({2, 3}), torch::rand({2, 3})}; + { + MyOptimizer optimizer(parameters); + REQUIRE(optimizer.size() == parameters.size()); + } + { + MyOptimizer optimizer; + REQUIRE(optimizer.size() == 0); + optimizer.add_parameters(parameters); + REQUIRE(optimizer.size() == parameters.size()); + } + { + Linear linear(3, 4); + MyOptimizer optimizer(linear->parameters()); + REQUIRE(optimizer.size() == linear->parameters().size()); + } +} + TEST_CASE("Optim/XORConvergence/SGD") { REQUIRE(test_optimizer_xor( SGDOptions(0.1).momentum(0.9).nesterov(true).weight_decay(1e-6))); diff --git a/test/cpp/api/parallel.cpp b/test/cpp/api/parallel.cpp new file mode 100644 index 00000000000000..2d8f413c053006 --- /dev/null +++ b/test/cpp/api/parallel.cpp @@ -0,0 +1,230 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using Catch::StartsWith; + +using namespace torch::autograd; +using namespace torch::nn; + +TEST_CASE("Parallel/DifferentiableScatter", "[multi-cuda]") { + Scatter scatter( + {torch::Device(torch::kCUDA, 0), torch::Device(torch::kCUDA, 1)}); + + auto input = torch::ones(10, torch::requires_grad(true)); + auto output = scatter.apply({input}); + + REQUIRE(output.size() == 2); + REQUIRE(output[0].size(0) == 5); + REQUIRE(output[1].size(0) == 5); + + REQUIRE(torch::cat({output[0].to(torch::kCPU), output[1].to(torch::kCPU)}) + .allclose(input)); + + auto sum = output[0].to({torch::kCUDA, 1}) + output[1]; + sum.backward(); + + REQUIRE(input.grad().defined()); + REQUIRE(input.grad().device().is_cpu()); + REQUIRE(input.grad().sum().toCInt() == 10); +} + +TEST_CASE("Parallel/DifferentiableGather", "[multi-cuda]") { + Gather 
gather(torch::Device(torch::kCUDA, 1)); + + auto a = torch::ones(5, torch::requires_grad(true).device({torch::kCUDA, 0})); + auto b = torch::ones(5, torch::requires_grad(true).device({torch::kCUDA, 1})); + + auto outputs = gather.apply({a, b}); + REQUIRE(outputs.size() == 1); + auto& output = outputs.front(); + + REQUIRE(output.size(0) == 10); + REQUIRE(output.device() == torch::Device(torch::kCUDA, 1)); + + auto chunks = output.chunk(2); + REQUIRE(chunks[0].to({torch::kCUDA, 0}).allclose(a)); + REQUIRE(chunks[1].allclose(b)); + + output.backward(); + + REQUIRE(a.grad().defined()); + REQUIRE(a.grad().device() == torch::Device(torch::kCUDA, 0)); + REQUIRE(a.grad().sum().toCInt() == 5); + + REQUIRE(b.grad().defined()); + REQUIRE(b.grad().device() == torch::Device(torch::kCUDA, 1)); + REQUIRE(b.grad().sum().toCInt() == 5); +} + +TEST_CASE("Parallel/Replicate", "[multi-cuda]") { + Linear linear(3, 4); + auto replicas = parallel::replicate( + linear, {torch::Device(torch::kCUDA, 0), torch::Device(torch::kCUDA, 1)}); + REQUIRE(replicas.size() == 2); + + auto original_parameters = linear->parameters(); + + auto replica1_parameters = replicas[0]->parameters(); + for (auto& parameter : replica1_parameters) { + REQUIRE(parameter->device() == torch::Device(torch::kCUDA, 0)); + } + replicas[0]->to(torch::kCPU); + REQUIRE(replica1_parameters.size() == original_parameters.size()); + for (size_t i = 0; i < original_parameters.size(); ++i) { + REQUIRE(replica1_parameters[i]->allclose(*original_parameters[i])); + REQUIRE( + replica1_parameters[i]->data().data() != + original_parameters[i]->data().data()); + } + + auto replica2_parameters = replicas[1]->parameters(); + for (auto& parameter : replica2_parameters) { + REQUIRE(parameter->device() == torch::Device(torch::kCUDA, 1)); + } + replicas[1]->to(torch::kCPU); + REQUIRE(replica2_parameters.size() == original_parameters.size()); + for (size_t i = 0; i < original_parameters.size(); ++i) { + REQUIRE(replica2_parameters[i]->allclose(*original_parameters[i])); + REQUIRE( + replica2_parameters[i]->data().data() != + original_parameters[i]->data().data()); + } +} + +TEST_CASE("Parallel/ParallelApply", "[multi-cuda]") { + Linear a(3, 4); + + Linear b(std::static_pointer_cast(a->clone())); + b->to({torch::kCUDA, 0}); + + Linear c(std::static_pointer_cast(a->clone())); + c->to({torch::kCUDA, 1}); + + std::vector modules = {a, b, c}; + std::vector inputs = { + torch::ones({2, 3}), + torch::ones({2, 3}, torch::device({torch::kCUDA, 0})), + torch::ones({2, 3}, torch::device({torch::kCUDA, 1}))}; + + auto outputs = parallel::parallel_apply(modules, inputs); + + REQUIRE(outputs.size() == 3); + REQUIRE(outputs[0].device().is_cpu()); + + REQUIRE(outputs[1].device() == torch::Device(torch::kCUDA, 0)); + REQUIRE(outputs[1].to(torch::kCPU).allclose(outputs[0])); + + REQUIRE(outputs[2].device() == torch::Device(torch::kCUDA, 1)); + REQUIRE(outputs[2].to(torch::kCPU).allclose(outputs[0])); +} + +TEST_CASE("Parallel/ParallelApplyWithDifferentOutputDevice", "[multi-cuda]") { + struct M : torch::nn::Module { + torch::Tensor forward(torch::Tensor input) { + return torch::ones({5}, torch::dtype(torch::kInt32)); + } + }; + + std::vector> modules = { + std::make_shared(), std::make_shared(), std::make_shared()}; + std::vector inputs = { + torch::empty({}), torch::empty({}), torch::empty({})}; + std::vector devices = { + {torch::kCUDA, 1}, {torch::kCUDA, 0}, {torch::kCPU}}; + + auto outputs = parallel::parallel_apply(modules, inputs, devices); + + REQUIRE(outputs.size() == 3); + 
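The Scatter/Gather, replicate, and parallel_apply tests above exercise the C++ counterparts of the long-standing Python data-parallel primitives. A rough Python-side sketch of the same scatter/gather round trip, guarded so it is a no-op unless at least two CUDA devices are visible (assumed here):

    import torch
    from torch.nn.parallel import gather, scatter

    if torch.cuda.device_count() >= 2:
        x = torch.ones(10, requires_grad=True)      # CPU leaf tensor
        chunks = scatter(x, target_gpus=[0, 1])     # two halves, one per GPU
        assert [c.size(0) for c in chunks] == [5, 5]
        y = gather(chunks, target_device=1)         # reassembled on cuda:1
        assert y.device == torch.device("cuda", 1)
        y.sum().backward()                          # autograd flows back through gather/scatter
        assert x.grad.device.type == "cpu"          # gradient lands on the CPU leaf
        assert x.grad.sum().item() == 10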
REQUIRE(outputs[0].device().is_cuda()); + REQUIRE(outputs[0].device() == torch::Device(torch::kCUDA, 1)); + + REQUIRE(outputs[1].device().is_cuda()); + REQUIRE(outputs[1].device() == torch::Device(torch::kCUDA, 0)); + + REQUIRE(outputs[2].device().is_cpu()); +} + +TEST_CASE("Parallel/ParallelApplyRethrowsException", "[multi-cuda]") { + struct M : torch::nn::Cloneable { + void reset() override {} + torch::Tensor forward(torch::Tensor input) { + throw std::runtime_error("Badness!"); + } + }; + + auto m = std::make_shared(); + auto input = torch::ones({10, 3}); + REQUIRE_THROWS_WITH( + parallel::data_parallel(m, input), StartsWith("Badness!")); +} + +TEST_CASE( + "Parallel/DataParallelPlacesTheOutputOnTheRequestedDevice", + "[multi-cuda]") { + struct M : torch::nn::Cloneable { + void reset() override {} + torch::Tensor forward(torch::Tensor input) { + // Intermediate tensors should be on the replica's current device. + intermediate_tensor = torch::rand(5); + // The returned tensor should be on the output device. + return torch::ones(3); + } + torch::Tensor intermediate_tensor; + }; + auto m = std::make_shared(); + auto input = torch::ones({10, 3}); + { + auto output = parallel::data_parallel( + m, + input, + /*devices=*/at::nullopt, + /*output_device=*/torch::Device(torch::kCUDA, 1)); + REQUIRE(output.defined()); + REQUIRE(output.device().is_cuda()); + REQUIRE(output.device().index() == 1); + } + { + // Verify for the single-device case (where we don't scatter/gather). + auto output = parallel::data_parallel( + m, + input, + /*devices=*/std::vector{torch::Device(torch::kCUDA, 0)}, + /*output_device=*/torch::Device(torch::kCUDA, 1)); + REQUIRE(m->intermediate_tensor.defined()); + REQUIRE(m->intermediate_tensor.device().is_cuda()); + REQUIRE(m->intermediate_tensor.device().index() == 0); + REQUIRE(output.defined()); + REQUIRE(output.device().is_cuda()); + REQUIRE(output.device().index() == 1); + } +} + +TEST_CASE("Parallel/DataParallelUsesAllAvailableCUDADevices", "[cuda]") { + struct M : torch::nn::Cloneable { + void reset() override {} + torch::Tensor forward(torch::Tensor input) { + return torch::tensor(torch::DefaultTensorOptions::get().device().index()); + } + }; + + auto m = std::make_shared(); + auto input = torch::ones({10, 3}); + auto output = parallel::data_parallel(m, input); + + const auto device_count = torch::cuda::device_count(); + REQUIRE(output.numel() == device_count); + for (size_t i = 0; i < device_count; ++i) { + REQUIRE(output[i].toCInt() == i); + } +} diff --git a/test/cpp/api/tensor_cuda.cpp b/test/cpp/api/tensor_cuda.cpp index 5a92bd18ec90f1..82d874e74b11b0 100644 --- a/test/cpp/api/tensor_cuda.cpp +++ b/test/cpp/api/tensor_cuda.cpp @@ -4,7 +4,7 @@ #include -TEST_CASE("Tensor/AllocatesTensorOnTheCorrectDevice", "[cuda]") { +TEST_CASE("Tensor/AllocatesTensorOnTheCorrectDevice", "[multi-cuda]") { auto tensor = at::tensor({1, 2, 3}, at::device({at::kCUDA, 1})); REQUIRE(tensor.device().type() == at::Device::Type::CUDA); REQUIRE(tensor.device().index() == 1); diff --git a/test/cpp/api/tensor_options_cuda.cpp b/test/cpp/api/tensor_options_cuda.cpp index 596e3b77610365..f5b0635c834d9a 100644 --- a/test/cpp/api/tensor_options_cuda.cpp +++ b/test/cpp/api/tensor_options_cuda.cpp @@ -41,7 +41,7 @@ TEST_CASE("TensorOptions/ConstructsWellFromCUDATypes", "[cuda]") { REQUIRE_OPTIONS(kCUDA, 5, kFloat, kSparse); } -TEST_CASE("TensorOptions/ConstructsWellFromCUDATensors", "[cuda]") { +TEST_CASE("TensorOptions/ConstructsWellFromCUDATensors", "[multi-cuda]") { auto options = 
TensorOptions(empty(5, device(kCUDA).dtype(kDouble))); REQUIRE_OPTIONS(kCUDA, 0, kDouble, kStrided); @@ -66,7 +66,7 @@ TEST_CASE("TensorOptions/ConstructsWellFromCUDATensors", "[cuda]") { } } -TEST_CASE("OptionsGuardCUDA", "[cuda]") { +TEST_CASE("OptionsGuardCUDA", "[multi-cuda]") { Tensor tensor; { OptionsGuard guard(device(kCUDA)); @@ -87,7 +87,7 @@ TEST_CASE("OptionsGuardCUDA", "[cuda]") { REQUIRE_TENSOR_OPTIONS(kCUDA, 0, kInt, kStrided); } -TEST_CASE("DeviceGuardOptionsGuardInteraction", "[cuda]") { +TEST_CASE("DeviceGuardOptionsGuardInteraction", "[multi-cuda]") { Tensor tensor; { // Check that OptionsGuard respects any active device before construction. @@ -111,3 +111,18 @@ TEST_CASE("DeviceGuardOptionsGuardInteraction", "[cuda]") { } } } + +TEST_CASE("DeviceGuardIsMovable", "[cuda]") { + DeviceGuard first(1); + REQUIRE(first.original_index() == 0); + REQUIRE(first.last_index() == 1); + DeviceGuard second(std::move(first)); + REQUIRE(second.original_index() == 0); + REQUIRE(second.last_index() == 1); + REQUIRE(first.original_index() == -1); + DeviceGuard third; + third = std::move(second); + REQUIRE(third.original_index() == 0); + REQUIRE(third.last_index() == 1); + REQUIRE(second.original_index() == -1); +} diff --git a/test/expect/TestScript.test_call_python_fn_from_traced_module.expect b/test/expect/TestScript.test_call_python_fn_from_traced_module.expect index 2a87a361a9e3fa..4c9e2e2146aaf2 100644 --- a/test/expect/TestScript.test_call_python_fn_from_traced_module.expect +++ b/test/expect/TestScript.test_call_python_fn_from_traced_module.expect @@ -1,6 +1,6 @@ graph(%0 : Double(3, 4) %1 : Double(4, 3)) { - %2 : Double(3, 4) = aten::neg(%0) - %4 : Double(3, 3) = aten::mm(%2, %1) + %2 : Double(3, 4) = aten::neg(%0), scope: TracedModule + %4 : Double(3, 3) = aten::mm(%2, %1), scope: TracedModule return (%4); } diff --git a/test/expect/TestScript.test_call_python_mod_from_traced_module.expect b/test/expect/TestScript.test_call_python_mod_from_traced_module.expect index 925bbf19ea5058..d39acaf5257d3d 100644 --- a/test/expect/TestScript.test_call_python_mod_from_traced_module.expect +++ b/test/expect/TestScript.test_call_python_mod_from_traced_module.expect @@ -1,8 +1,8 @@ graph(%0 : Double(3, 4) %1 : Double(4, 5) %2 : Double(5, 7)) { - %4 : Double(3, 5) = aten::mm(%0, %1) - %6 : Double(3, 7) = aten::mm(%4, %2) - %7 : Double(3, 7) = aten::add[other={1}, alpha={1}](%6) + %4 : Double(3, 5) = aten::mm(%0, %1), scope: TracedModule + %6 : Double(3, 7) = aten::mm(%4, %2), scope: TracedModule/PythonModule[mod] + %7 : Double(3, 7) = aten::add[other={1}, alpha={1}](%6), scope: TracedModule return (%7); } diff --git a/test/expect/TestScript.test_call_python_mod_from_tracing_fn.expect b/test/expect/TestScript.test_call_python_mod_from_tracing_fn.expect index 4de15a540cc32e..ea847d630c8ba5 100644 --- a/test/expect/TestScript.test_call_python_mod_from_tracing_fn.expect +++ b/test/expect/TestScript.test_call_python_mod_from_tracing_fn.expect @@ -1,6 +1,6 @@ graph(%0 : Double(3, 4)) { - %1 : Double(4, 3) = prim::Constant[value=]() - %3 : Double(3, 3) = aten::mm(%0, %1) + %1 : Double(4, 3) = prim::Constant[value=](), scope: PythonMod + %3 : Double(3, 3) = aten::mm(%0, %1), scope: PythonMod %4 : Double(3, 3) = aten::add[other={1}, alpha={1}](%3) return (%4); } diff --git a/test/expect/TestScript.test_call_script_fn_from_traced_module.expect b/test/expect/TestScript.test_call_script_fn_from_traced_module.expect index adaab3880dc46d..6bf57b856cac8e 100644 --- 
a/test/expect/TestScript.test_call_script_fn_from_traced_module.expect +++ b/test/expect/TestScript.test_call_script_fn_from_traced_module.expect @@ -1,6 +1,6 @@ graph(%0 : Double(3, 4) %1 : Double(4, 5)) { - %3 : Double(3, 5) = aten::mm(%0, %1) - %5 : Double(3, 5) = aten::neg(%3) + %3 : Double(3, 5) = aten::mm(%0, %1), scope: TracedModule + %5 : Double(3, 5) = aten::neg(%3), scope: TracedModule/ScriptModule return (%5); } diff --git a/test/expect/TestScript.test_call_script_fn_from_tracing_fn.expect b/test/expect/TestScript.test_call_script_fn_from_tracing_fn.expect index cffec80d884616..dc8b4945df4773 100644 --- a/test/expect/TestScript.test_call_script_fn_from_tracing_fn.expect +++ b/test/expect/TestScript.test_call_script_fn_from_tracing_fn.expect @@ -1,5 +1,5 @@ graph(%0 : Double(3, 4)) { - %2 : Double(3, 4) = aten::neg(%0) + %2 : Double(3, 4) = aten::neg(%0), scope: ScriptModule %3 : Double(3, 4) = aten::add[other={1}, alpha={1}](%2) return (%3); } diff --git a/test/expect/TestScript.test_call_script_mod_from_tracing_fn.expect b/test/expect/TestScript.test_call_script_mod_from_tracing_fn.expect index d446882fbaa956..fc7039bd971f23 100644 --- a/test/expect/TestScript.test_call_script_mod_from_tracing_fn.expect +++ b/test/expect/TestScript.test_call_script_mod_from_tracing_fn.expect @@ -1,6 +1,6 @@ graph(%0 : Double(3, 4)) { - %1 : Double(4, 3) = prim::Constant[value=]() - %4 : Double(3, 3) = aten::mm(%0, %1) + %1 : Double(4, 3) = prim::Constant[value=](), scope: ScriptMod + %4 : Double(3, 3) = aten::mm(%0, %1), scope: ScriptMod %5 : Double(3, 3) = aten::add[other={1}, alpha={1}](%4) return (%5); } diff --git a/test/expect/TestScript.test_call_script_module_from_traced_module.expect b/test/expect/TestScript.test_call_script_module_from_traced_module.expect index c249ddc6b8c171..21b14a2a62f8cf 100644 --- a/test/expect/TestScript.test_call_script_module_from_traced_module.expect +++ b/test/expect/TestScript.test_call_script_module_from_traced_module.expect @@ -1,8 +1,8 @@ graph(%0 : Double(3, 4) %1 : Double(4, 5) %2 : Double(5, 7)) { - %4 : Double(3, 5) = aten::mm(%0, %1) - %7 : Double(3, 7) = aten::mm(%4, %2) - %8 : Double(3, 7) = aten::add[other={1}, alpha={1}](%7) + %4 : Double(3, 5) = aten::mm(%0, %1), scope: TracedModule + %7 : Double(3, 7) = aten::mm(%4, %2), scope: TracedModule/ScriptMod[mod] + %8 : Double(3, 7) = aten::add[other={1}, alpha={1}](%7), scope: TracedModule return (%8); } diff --git a/test/expect/TestScript.test_call_traced_fn_from_traced_module.expect b/test/expect/TestScript.test_call_traced_fn_from_traced_module.expect index 4e25a8581f0706..f45c3f15a9caed 100644 --- a/test/expect/TestScript.test_call_traced_fn_from_traced_module.expect +++ b/test/expect/TestScript.test_call_traced_fn_from_traced_module.expect @@ -1,6 +1,6 @@ graph(%0 : Double(3, 4) %1 : Double(4, 5)) { - %3 : Double(3, 5) = aten::mm(%0, %1) - %5 : Double(3, 4) = aten::neg(%3) + %3 : Double(3, 5) = aten::mm(%0, %1), scope: TracedModule + %5 : Double(3, 4) = aten::neg(%3), scope: TracedModule/traced_fn return (%5); } diff --git a/test/expect/TestScript.test_call_traced_fn_from_tracing_fn.expect b/test/expect/TestScript.test_call_traced_fn_from_tracing_fn.expect index cffec80d884616..ed737f4b6580b4 100644 --- a/test/expect/TestScript.test_call_traced_fn_from_tracing_fn.expect +++ b/test/expect/TestScript.test_call_traced_fn_from_tracing_fn.expect @@ -1,5 +1,5 @@ graph(%0 : Double(3, 4)) { - %2 : Double(3, 4) = aten::neg(%0) + %2 : Double(3, 4) = aten::neg(%0), scope: traced_fn1 %3 : Double(3, 4) = 
aten::add[other={1}, alpha={1}](%2) return (%3); } diff --git a/test/expect/TestScript.test_call_traced_mod_from_tracing_fn.expect b/test/expect/TestScript.test_call_traced_mod_from_tracing_fn.expect index d446882fbaa956..3fac45fc2dfdab 100644 --- a/test/expect/TestScript.test_call_traced_mod_from_tracing_fn.expect +++ b/test/expect/TestScript.test_call_traced_mod_from_tracing_fn.expect @@ -1,6 +1,6 @@ graph(%0 : Double(3, 4)) { - %1 : Double(4, 3) = prim::Constant[value=]() - %4 : Double(3, 3) = aten::mm(%0, %1) + %1 : Double(4, 3) = prim::Constant[value=](), scope: TracedModule[TracedModule] + %4 : Double(3, 3) = aten::mm(%0, %1), scope: TracedModule[TracedModule] %5 : Double(3, 3) = aten::add[other={1}, alpha={1}](%4) return (%5); } diff --git a/test/expect/TestScript.test_call_traced_module_from_traced_module.expect b/test/expect/TestScript.test_call_traced_module_from_traced_module.expect index c249ddc6b8c171..471f9f1c2ec3fe 100644 --- a/test/expect/TestScript.test_call_traced_module_from_traced_module.expect +++ b/test/expect/TestScript.test_call_traced_module_from_traced_module.expect @@ -1,8 +1,8 @@ graph(%0 : Double(3, 4) %1 : Double(4, 5) %2 : Double(5, 7)) { - %4 : Double(3, 5) = aten::mm(%0, %1) - %7 : Double(3, 7) = aten::mm(%4, %2) - %8 : Double(3, 7) = aten::add[other={1}, alpha={1}](%7) + %4 : Double(3, 5) = aten::mm(%0, %1), scope: TracedModule + %7 : Double(3, 7) = aten::mm(%4, %2), scope: TracedModule/TracedModule[TracedModule1][mod] + %8 : Double(3, 7) = aten::add[other={1}, alpha={1}](%7), scope: TracedModule return (%8); } diff --git a/test/onnx/model_defs/squeezenet.py b/test/onnx/model_defs/squeezenet.py index 3db99b338bca97..e4ace18194ab71 100644 --- a/test/onnx/model_defs/squeezenet.py +++ b/test/onnx/model_defs/squeezenet.py @@ -67,7 +67,7 @@ def __init__(self, version=1.0, num_classes=1000, ceil_mode=False): Fire(384, 64, 256, 256), Fire(512, 64, 256, 256), ) - # Final convolution is initialized differently form the rest + # Final convolution is initialized differently from the rest final_conv = nn.Conv2d(512, self.num_classes, kernel_size=1) self.classifier = nn.Sequential( nn.Dropout(p=0.5), diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index cbde1fa75c3610..dd601881131b96 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -432,7 +432,7 @@ def symb(g, x, y): return g.op('Sum', x, y[0], y[1]), ( g.op('Neg', x), g.op('Neg', y[0])) - @torch.onnx.symbolic_override_first_arg_based(symb) + @torch.onnx.symbolic_override(symb) def foo(x, y): return x + y[0] + y[1], (-x, -y[0]) diff --git a/test/test_autograd.py b/test/test_autograd.py index cb1c4e3def7e2b..e09b3366029427 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -10,6 +10,7 @@ from itertools import product from operator import mul, itemgetter from functools import reduce, wraps +from torch._six import inf, nan from torch.autograd.gradcheck import gradgradcheck, gradcheck from torch.autograd.function import once_differentiable from torch.autograd.profiler import profile @@ -1524,12 +1525,12 @@ def _test_pyscalar_conversions(self, t, integral_conv): pyscalar = -12345.1 f[0] = pyscalar self.assertEqual(float(f), pyscalar) - f[0] = float('nan') + f[0] = nan self.assertTrue(math.isnan(float(f))) - f[0] = float('inf') - self.assertEqual(float(f), float('inf'), allow_inf=True) - f[0] = float('-inf') - self.assertEqual(float(f), float('-inf'), allow_inf=True) + f[0] = inf + self.assertEqual(float(f), inf, allow_inf=True) + f[0] = -inf + 
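The _test_pyscalar_conversions assertions around here boil down to ordinary Python float/int conversion rules; a small self-contained sketch (written against .item() rather than the 0-dim tensor element the test converts directly):

    import math
    import torch

    f = torch.zeros(1, dtype=torch.double)

    f[0] = math.inf
    assert float(f) == math.inf          # float() of a one-element tensor is fine

    f[0] = math.nan
    assert math.isnan(float(f))

    try:
        int(f[0].item())                 # NaN -> ValueError
    except ValueError:
        pass

    f[0] = math.inf
    try:
        int(f[0].item())                 # +/-inf -> OverflowError
    except OverflowError:
        pass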
self.assertEqual(float(f), -inf, allow_inf=True) # integral -> floating point # check we can convert something that loses precision @@ -1539,11 +1540,11 @@ def _test_pyscalar_conversions(self, t, integral_conv): self.assertEqual(float(l), float(pyscalar)) # floating point -> integral - f[0] = float('nan') + f[0] = nan self.assertRaises(ValueError, lambda: integral_conv(f[0])) - f[0] = float('inf') + f[0] = inf self.assertRaises(OverflowError, lambda: integral_conv(f[0])) - f[0] = float('-inf') + f[0] = -inf self.assertRaises(OverflowError, lambda: integral_conv(f[0])) f[0] = sys.float_info.max self.assertEqual(integral_conv(f), sys.float_info.max) @@ -1558,9 +1559,9 @@ def test_nonzero(tensor, value, expected): test_nonzero(l, -2, True) test_nonzero(f, 0.0, False) test_nonzero(f, sys.float_info.min, True) - test_nonzero(f, float('nan'), bool(float('nan'))) - test_nonzero(f, float('inf'), bool(float('inf'))) - test_nonzero(f, float('-inf'), bool(float('-inf'))) + test_nonzero(f, nan, bool(nan)) + test_nonzero(f, inf, bool(inf)) + test_nonzero(f, -inf, bool(-inf)) def test_pyscalar_conversions(self): self._test_pyscalar_conversions(lambda x: x, lambda x: int(x)) @@ -2106,13 +2107,22 @@ def test_dir(self): def test_as_strided(self): - def test(x, repro_fn, *args): + def test(x, prepro_fn, size, strides, offset=None): + x = x.to(torch.double).detach().requires_grad_() + + # Check that forward will **not** resize storage because it may + # cause NaN in output and fail numerical Jacobian check consequently + with torch.no_grad(): + y = prepro_fn(x) if prepro_fn is not None else x + max_offset = sum((si - 1) * st for si, st in zip(size, strides)) + max_offset += offset if offset is not None else y.storage_offset() + assert max_offset < len(y.storage()), "test case resizes storage" + def closure(x): - if repro_fn is not None: - x = repro_fn(x) - return x.as_strided(*args) + if prepro_fn is not None: + x = prepro_fn(x) + return x.as_strided(size, strides, offset) - x = x.to(torch.double).detach().requires_grad_() gradcheck(closure, [x]) gradgradcheck(closure, [x]) @@ -2120,7 +2130,7 @@ def closure(x): test(torch.arange(0, 25), lambda x: x.view(5, 5), [3, 3], [6, 2], 2) # test crazy stride at dim with size 1 case - test(torch.randn(10), None, [1, 2, 1, 5], [0, 5, 100, 1], 2) + test(torch.randn(12), None, [1, 2, 1, 5], [0, 5, 100, 1], 2) # test expand case test(torch.randn(5), None, [3, 3, 3], [0, 1, 0], 2) @@ -2634,6 +2644,9 @@ class dont_convert(tuple): ('reshape', (S,), (S,), '1d'), ('reshape', (), (dont_convert(()),), 'scalar_to_scalar'), ('reshape', (), (1,), 'scalar_to_1d'), + ('reshape_as', (S, S, S), (non_differentiable(torch.rand(S * S, S)),)), + ('reshape_as', (), (non_differentiable(torch.tensor(42.)),), 'scalar'), + ('reshape_as', (), (non_differentiable(torch.rand(1, 1)),), 'scalar_to_dims'), ('flip', (S, S, S), ([0],), 'd0'), ('flip', (S, S, S), ([0, 1, 2],), 'd012'), ('flip', (S, S, S), ([0, 2],), 'd02'), @@ -2825,7 +2838,7 @@ class dont_convert(tuple): ('std', (S,), (0, True, True), 'keepdim_dim_1d', [0]), ('renorm', (S, S, S), (2, 1, 0.5), 'dim', [1]), ('renorm', (S, S, S), (1, 2, 3), 'norm_1'), - ('renorm', (S, S, S), (float('inf'), 2, 0.5), 'norm_inf'), + ('renorm', (S, S, S), (inf, 2, 0.5), 'norm_inf'), ('repeat', (S,), (2,), 'single_number'), ('repeat', (), (2, 3), 'scalar'), ('repeat', (2, 2), (3, 2)), @@ -2917,7 +2930,7 @@ class dont_convert(tuple): ('norm', (S, S), (0.5,), '0_5'), ('norm', (S, S), (1,), '1'), ('norm', (S, S), (3,), '3'), - ('norm', (S, S), (float('inf'),), 
'inf'), + ('norm', (S, S), (inf,), 'inf'), ('norm', (S, S), (-1,), 'neg_1'), ('norm', (S, S), (-0.5,), 'neg_0_5'), ('norm', (S, S), (-1.5,), 'neg_1_5'), diff --git a/test/test_cuda.py b/test/test_cuda.py index 6e6705b9b3eac6..9d56682f3cf066 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -12,6 +12,7 @@ import torch.cuda import torch.cuda.comm as comm from torch import multiprocessing as mp +from torch._six import inf, nan from test_torch import TestTorch from common import TestCase, get_gpu_type, to_gpu, freeze_rng_state, run_tests, \ @@ -89,11 +90,7 @@ def is_half(t): def number(floating, integer, t): - name = type(t).__name__ - if 'Double' in name or 'Float' in name or 'Half' in name: - return floating - else: - return integer + return floating if is_floating(t) else integer def cast_tensor(tensor, t): @@ -480,6 +477,7 @@ def tmp(t): 'add': 1e-2, 'acos': 1e-3, 'addbmm': 1e-1, + 'addcdiv': 1e-2, 'addcmul': 1e-2, 'addmm': 1e-1, 'addmv': 1e-2, @@ -501,6 +499,7 @@ def tmp(t): 'erfinv': 1e-3, 'exp': 1e-2, 'expm1': 1e-2, + 'fill': 1e-3, 'lerp': 1e-2, 'lgamma': 1e-2, 'log': 1e-2, @@ -595,7 +594,7 @@ def tmp(self): gpu_tensor = to_gpu(cpu_tensor) cpu_args = arg_constructor(t) gpu_args = [to_gpu(arg) for arg in cpu_args] - if t.__name__ == 'HalfTensor': + if is_half(t): cpu_tensor = cpu_tensor.float() cpu_args = [arg.float() if isinstance(arg, torch.Tensor) and is_half(arg) else arg for arg in cpu_args] cpu_result = getattr(cpu_tensor, fn)(*cpu_args) @@ -784,7 +783,7 @@ def advance(gen, end): if not end0: gen1_max_times = torch.LongTensor(1).random_(0, 3)[0] else: - gen1_max_times = float('inf') + gen1_max_times = inf t = 0 while t < gen1_max_times and not end1: end1 = advance(gen1, end1) @@ -903,7 +902,7 @@ def test_min_max_nan(self): (lambda x: x.max(0)[0], 'max_dim')] for f, name in tests: a = torch.arange(25.0).view(5, 5) - a[2, 2] = float('nan') + a[2, 2] = nan actual = f(a.cuda()).cpu() expected = f(a).cpu() self.assertEqual(torch.isnan(actual), torch.isnan(expected), 'nans for {}'.format(name)) @@ -1479,11 +1478,8 @@ def mute(): os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stderr.fileno()) def _spawn_method(self, method, arg): - try: - mp.set_start_method('spawn') - except RuntimeError: - pass - with mp.Pool(1, initializer=self.mute) as pool: + ctx = mp.get_context("spawn") + with ctx.Pool(1, initializer=self.mute) as pool: errors = pool.map(method, [arg]) for e in errors: if 'device-side assert triggered' not in str(e): @@ -1508,9 +1504,9 @@ def _test_multinomial_invalid_probs_cuda(probs): def test_multinomial_invalid_probs_cuda(self): test_method = TestCuda._test_multinomial_invalid_probs_cuda self._spawn_method(test_method, torch.Tensor([0, -1])) - self._spawn_method(test_method, torch.Tensor([0, float('inf')])) - self._spawn_method(test_method, torch.Tensor([0, float('-inf')])) - self._spawn_method(test_method, torch.Tensor([0, float('nan')])) + self._spawn_method(test_method, torch.Tensor([0, inf])) + self._spawn_method(test_method, torch.Tensor([0, -inf])) + self._spawn_method(test_method, torch.Tensor([0, nan])) def test_broadcast(self): TestTorch._test_broadcast(self, lambda t: t.cuda()) @@ -1691,7 +1687,6 @@ def test(use_double=False): cpu_tensor = torch.tensor([-0.999999994, -1.999999994, -2.0000000111, -100.99999994, -1931.99999994, 0.000000111, -0.000000111, 0, -1, -2, -931]) - nan = float('nan') expected_errors = torch.tensor([0, 0, 0, 0, 0, 0, 0, nan, nan, nan, nan]) gpu_tensor = cpu_tensor.cuda() cpu_out = cpu_tensor.digamma() @@ -1912,7 +1907,7 @@ def 
generate_tests(): continue precision = custom_precision.get(name, TestCuda.precision) - if t == torch.HalfTensor: + if is_half(t): precision = custom_half_precision.get(name, precision) for inplace in (True, False): diff --git a/test/test_dataloader.py b/test/test_dataloader.py index d9a03f3401158d..8b3136a57d8e6a 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -9,7 +9,7 @@ import traceback import unittest import subprocess -from torch import multiprocessing +from torch import multiprocessing as mp from torch.utils.data import Dataset, TensorDataset, DataLoader, ConcatDataset from torch.utils.data.dataset import random_split from torch.utils.data.dataloader import default_collate, ExceptionWrapper, MANAGER_STATUS_CHECK_INTERVAL @@ -24,12 +24,9 @@ # We need spawn start method for test_manager_unclean_exit, but # Python 2.7 doesn't allow it. if sys.version_info[0] == 3: - # Without the try-catch block, some tests will complain that - # context has already been set. - try: - multiprocessing.set_start_method('spawn') - except RuntimeError: - pass + # Get a multiprocessing context because some test / third party library will + # set start_method when imported, and setting again triggers RuntimeError. + mp = mp.get_context(method='spawn') JOIN_TIMEOUT = 17.0 if IS_WINDOWS else 6.5 @@ -144,11 +141,11 @@ def test_add_dataset(self): # Stores the first encountered exception in .exception. # Inspired by https://stackoverflow.com/a/33599967 -class ErrorTrackingProcess(multiprocessing.Process): +class ErrorTrackingProcess(mp.Process): def __init__(self, *args, **kwargs): super(ErrorTrackingProcess, self).__init__(*args, **kwargs) - self._pconn, self._cconn = multiprocessing.Pipe() + self._pconn, self._cconn = mp.Pipe() self._exception = None def run(self): @@ -235,8 +232,8 @@ class SynchronizedSeedDataset(Dataset): def __init__(self, size, num_workers): assert size >= num_workers - self.count = multiprocessing.Value('i', 0, lock=True) - self.barrier = multiprocessing.Semaphore(0) + self.count = mp.Value('i', 0, lock=True) + self.barrier = mp.Semaphore(0) self.num_workers = num_workers self.size = size @@ -537,12 +534,12 @@ def _is_process_alive(pid, pname): def test_manager_unclean_exit(self): '''there might be ConnectionResetError or leaked semaphore warning (due to dirty process exit), \ but they are all safe to ignore''' - worker_pids = multiprocessing.Array('i', [0] * 4) + worker_pids = mp.Array('i', [0] * 4) - manager_exit_event = multiprocessing.Event() - mp = multiprocessing.Process(target=TestDataLoader._manager_process, - args=(self.dataset, worker_pids, manager_exit_event)) - mp.start() + manager_exit_event = mp.Event() + p = mp.Process(target=TestDataLoader._manager_process, + args=(self.dataset, worker_pids, manager_exit_event)) + p.start() manager_exit_event.wait() diff --git a/test/test_distributions.py b/test/test_distributions.py index 2f97370f713d8f..f53271e1ea0277 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -30,6 +30,7 @@ from random import shuffle import torch +from torch._six import inf from common import TestCase, run_tests, set_rng_seed, TEST_WITH_UBSAN from common_cuda import TEST_CUDA from torch.autograd import grad, gradcheck @@ -782,7 +783,7 @@ def test_geometric(self): s = 0.3 self.assertEqual(Geometric(p).sample((8,)).size(), (8, 3)) self.assertEqual(Geometric(1).sample(), 0) - self.assertEqual(Geometric(1).log_prob(torch.tensor(1.)), -float('inf'), allow_inf=True) + 
self.assertEqual(Geometric(1).log_prob(torch.tensor(1.)), -inf, allow_inf=True) self.assertEqual(Geometric(1).log_prob(torch.tensor(0.)), 0) self.assertFalse(Geometric(p).sample().requires_grad) self.assertEqual(Geometric(r).sample((8,)).size(), (8,)) @@ -1162,8 +1163,8 @@ def test_uniform(self): uniform = Uniform(low_1d, high_1d) above_high = torch.tensor([4.0]) below_low = torch.tensor([-1.0]) - self.assertEqual(uniform.log_prob(above_high).item(), -float('inf'), allow_inf=True) - self.assertEqual(uniform.log_prob(below_low).item(), -float('inf'), allow_inf=True) + self.assertEqual(uniform.log_prob(above_high).item(), -inf, allow_inf=True) + self.assertEqual(uniform.log_prob(below_low).item(), -inf, allow_inf=True) # check cdf computation when value outside range self.assertEqual(uniform.cdf(below_low).item(), 0) @@ -1190,7 +1191,7 @@ def test_cauchy(self): loc_1d = torch.zeros(1, requires_grad=True) scale_1d = torch.ones(1, requires_grad=True) self.assertTrue(is_all_nan(Cauchy(loc_1d, scale_1d).mean)) - self.assertEqual(Cauchy(loc_1d, scale_1d).variance, float('inf'), allow_inf=True) + self.assertEqual(Cauchy(loc_1d, scale_1d).variance, inf, allow_inf=True) self.assertEqual(Cauchy(loc, scale).sample().size(), (5, 5)) self.assertEqual(Cauchy(loc, scale).sample((7,)).size(), (7, 5, 5)) self.assertEqual(Cauchy(loc_1d, scale_1d).sample().size(), (1,)) @@ -1216,7 +1217,7 @@ def test_halfcauchy(self): scale = torch.ones(5, 5, requires_grad=True) scale_1d = torch.ones(1, requires_grad=True) self.assertTrue(is_all_nan(HalfCauchy(scale_1d).mean)) - self.assertEqual(HalfCauchy(scale_1d).variance, float('inf'), allow_inf=True) + self.assertEqual(HalfCauchy(scale_1d).variance, inf, allow_inf=True) self.assertEqual(HalfCauchy(scale).sample().size(), (5, 5)) self.assertEqual(HalfCauchy(scale).sample((7,)).size(), (7, 5, 5)) self.assertEqual(HalfCauchy(scale_1d).sample().size(), (1,)) @@ -1714,8 +1715,8 @@ def test_pareto(self): alpha = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) scale_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) alpha_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) - self.assertEqual(Pareto(scale_1d, 0.5).mean, float('inf'), allow_inf=True) - self.assertEqual(Pareto(scale_1d, 0.5).variance, float('inf'), allow_inf=True) + self.assertEqual(Pareto(scale_1d, 0.5).mean, inf, allow_inf=True) + self.assertEqual(Pareto(scale_1d, 0.5).variance, inf, allow_inf=True) self.assertEqual(Pareto(scale, alpha).sample().size(), (2, 3)) self.assertEqual(Pareto(scale, alpha).sample((5,)).size(), (5, 2, 3)) self.assertEqual(Pareto(scale_1d, alpha_1d).sample((1,)).size(), (1, 1)) @@ -1832,7 +1833,7 @@ def test_studentT(self): df_1d = torch.tensor(torch.exp(torch.randn(1)), requires_grad=True) self.assertTrue(is_all_nan(StudentT(1).mean)) self.assertTrue(is_all_nan(StudentT(1).variance)) - self.assertEqual(StudentT(2).variance, float('inf'), allow_inf=True) + self.assertEqual(StudentT(2).variance, inf, allow_inf=True) self.assertEqual(StudentT(df).sample().size(), (2, 3)) self.assertEqual(StudentT(df).sample((5,)).size(), (5, 2, 3)) self.assertEqual(StudentT(df_1d).sample((1,)).size(), (1, 1)) @@ -2962,7 +2963,7 @@ def test_kl_exponential_family(self): def test_kl_infinite(self): for p, q in self.infinite_examples: - self.assertTrue((kl_divergence(p, q) == float('inf')).all(), + self.assertTrue((kl_divergence(p, q) == inf).all(), 'Incorrect KL({}, {})'.format(type(p).__name__, type(q).__name__)) def test_kl_edgecases(self): @@ -2996,7 +2997,7 @@ def 
test_entropy_monte_carlo(self): continue x = dist.sample(sample_shape=(60000,)) expected = -dist.log_prob(x).mean(0) - ignore = (expected == float('inf')) + ignore = (expected == inf) expected[ignore] = actual[ignore] self.assertEqual(actual, expected, prec=0.2, message='\n'.join([ '{} example {}/{}, incorrect .entropy().'.format(Dist.__name__, i + 1, len(params)), @@ -3157,12 +3158,12 @@ def test_categorical_log_prob(self): def test_categorical_log_prob_with_logits(self): for dtype in ([torch.float, torch.double]): - p = torch.tensor([-float('inf'), 0], dtype=dtype, requires_grad=True) + p = torch.tensor([-inf, 0], dtype=dtype, requires_grad=True) categorical = OneHotCategorical(logits=p) log_pdf_prob_1 = categorical.log_prob(torch.tensor([0, 1], dtype=dtype)) self.assertEqual(log_pdf_prob_1.item(), 0) log_pdf_prob_0 = categorical.log_prob(torch.tensor([1, 0], dtype=dtype)) - self.assertEqual(log_pdf_prob_0.item(), -float('inf'), allow_inf=True) + self.assertEqual(log_pdf_prob_0.item(), -inf, allow_inf=True) def test_multinomial_log_prob(self): for dtype in ([torch.float, torch.double]): @@ -3174,12 +3175,12 @@ def test_multinomial_log_prob(self): def test_multinomial_log_prob_with_logits(self): for dtype in ([torch.float, torch.double]): - p = torch.tensor([-float('inf'), 0], dtype=dtype, requires_grad=True) + p = torch.tensor([-inf, 0], dtype=dtype, requires_grad=True) multinomial = Multinomial(10, logits=p) log_pdf_prob_1 = multinomial.log_prob(torch.tensor([0, 10], dtype=dtype)) self.assertEqual(log_pdf_prob_1.item(), 0) log_pdf_prob_0 = multinomial.log_prob(torch.tensor([10, 0], dtype=dtype)) - self.assertEqual(log_pdf_prob_0.item(), -float('inf'), allow_inf=True) + self.assertEqual(log_pdf_prob_0.item(), -inf, allow_inf=True) class TestLazyLogitsInitialization(TestCase): diff --git a/test/test_jit.py b/test/test_jit.py index 706f33fe2db1d5..a20436e167188f 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -333,9 +333,9 @@ def test_scopes(self): def f(x, y): out = x + y - with torch.jit.scope('Foo', out): + with torch.jit.scope('Foo'): out = x * out - with torch.jit.scope('Bar', out): + with torch.jit.scope('Bar'): out = torch.tanh(out) out = torch.sigmoid(out) return out @@ -488,6 +488,16 @@ def test_relu(self): ge = self.checkTrace(self.fn_test_relu, (x, y)) + @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_small_constant(self): + def fn_test_small_constant(x, y): + return (1e-8 * x + 5e-9 * y) * 1e8 + x = torch.randn(4, 4, dtype=torch.float, device='cuda') + y = torch.randn(4, 4, dtype=torch.float, device='cuda') + + ge = self.checkTrace(fn_test_small_constant, (x, y)) + @staticmethod def fn_test_exp(x, y): return (x + .5 * y).exp() diff --git a/test/test_multiprocessing.py b/test/test_multiprocessing.py index e88a3578f2d9d2..da7ee1cdd545c7 100644 --- a/test/test_multiprocessing.py +++ b/test/test_multiprocessing.py @@ -249,8 +249,6 @@ def test_fd_sharing(self): self._test_sharing(repeat=TEST_REPEATS) @unittest.skipIf(platform == 'darwin', "file descriptor strategy is not supported on macOS") - @unittest.skipIf(TEST_WITH_ASAN, - "test_fd_preserve_sharing is known buggy, see https://github.com/pytorch/pytorch/issues/5311") def test_fd_preserve_sharing(self): self._test_preserve_sharing(repeat=TEST_REPEATS) @@ -264,8 +262,6 @@ def test_fs_sharing(self): with fs_sharing(): self._test_sharing(repeat=TEST_REPEATS) - @unittest.skipIf(TEST_WITH_ASAN, - "test_fs_preserve_sharing is known buggy, see 
https://github.com/pytorch/pytorch/issues/5311") def test_fs_preserve_sharing(self): with fs_sharing(): self._test_preserve_sharing(repeat=TEST_REPEATS) diff --git a/test/test_nn.py b/test/test_nn.py index 6e4de3aba533cb..f318132d9b51ea 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -15,6 +15,7 @@ import os import torch +from torch._six import inf, nan import torch.backends.cudnn as cudnn import torch.nn as nn import torch.nn.functional as F @@ -733,6 +734,20 @@ def _test_dropout(self, cls, input): module.__repr__() str(module) + def _test_alpha_dropout(self, cls, input): + mean = input.mean() + std = input.std() + + for p in [0.2, 0.5, 0.8]: + module = cls(p) + input_var = torch.tensor(input, requires_grad=True) + output = module(input_var) + # output mean should be close to input mean + self.assertLess(abs(output.data.mean() - mean), 0.1) + # output std should be close to input std + self.assertLess(abs(output.data.std() - std), 0.1) + output.backward(input) + def test_parameters(self): def num_params(module): return len(list(module.parameters())) @@ -1451,7 +1466,7 @@ def test_clip_grad_norm(self): def compute_norm(norm_type): norm_type = float(norm_type) - if norm_type != float('inf'): + if norm_type != inf: total_norm = 0 for p in l.parameters(): total_norm += p.grad.data.abs().pow(norm_type).sum() @@ -1546,8 +1561,6 @@ def test_vector_to_parameters(self): # We don't want to make propagating NaN a hard requirement on ops, but for # these easy ones, we should make them do so. def _test_nonlinearity_propagate_nan(self, device): - nan = float('nan') - def test(nonlinearity, *args, **kwargs): x = torch.tensor([nan], device=device) fn = getattr(F, nonlinearity) @@ -2086,19 +2099,16 @@ def test_Dropout3d(self): def test_AlphaDropout(self): # generate random tensor with zero mean and unit std input = torch.randn(5000) + self._test_alpha_dropout(nn.AlphaDropout, input) - mean = input.mean() - std = input.std() - - for p in [0.2, 0.5, 0.8]: - module = nn.AlphaDropout(p) - input_var = torch.tensor(input, requires_grad=True) - output = module(input_var) - # output mean should be close to input mean - self.assertLess(abs(output.data.mean() - mean), 0.1) - # output std should be close to input std - self.assertLess(abs(output.data.std() - std), 0.1) - output.backward(input) + def test_FeatureAlphaDropout(self): + b = random.randint(1, 5) + w = random.randint(1, 5) + h = random.randint(1, 5) + d = random.randint(1, 2) + num_features = 1000 + input = torch.randn(num_features, b, d, w, h) + self._test_alpha_dropout(nn.FeatureAlphaDropout, input) def _test_InstanceNorm_general(self, cls, input, device="cpu", dtype=torch.float): # default case track_running_stats=False @@ -2536,7 +2546,7 @@ def _test_max_pool_nan(self, device, dtype=torch.float): for num_dim in [1, 2, 3]: fn_name = '{}max_pool{}d'.format(adaptive, num_dim) fn = getattr(F, fn_name) - x = torch.full([1, 1] + num_dim * [3], float('nan')) + x = torch.full([1, 1] + num_dim * [3], nan) res = fn(x, 1 if adaptive else 3) self.assertTrue(math.isnan(res.item())) diff --git a/test/test_optim.py b/test/test_optim.py index 67328919c32df6..35aa7b2bfb7a5b 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -3,6 +3,7 @@ import functools from copy import deepcopy import torch +from torch._six import inf import torch.optim as optim import torch.legacy.optim as old_optim import torch.nn.functional as F @@ -479,8 +480,8 @@ def test_lbfgs(self): @unittest.skipIf(TEST_WITH_UBSAN, "division-by-zero error with UBSAN") def 
test_lbfgs_return_type(self): params = [torch.randn(10, 5), torch.randn(10)] - opt1 = optim.LBFGS(params, 0.01, tolerance_grad=float('inf')) - opt2 = optim.LBFGS(params, 0.01, tolerance_grad=-float('inf')) + opt1 = optim.LBFGS(params, 0.01, tolerance_grad=inf) + opt2 = optim.LBFGS(params, 0.01, tolerance_grad=-inf) def closure(): return torch.Tensor([10]) diff --git a/test/test_torch.py b/test/test_torch.py index 4a015829c389a5..bf8f9102529595 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -16,12 +16,13 @@ from torch._utils_internal import get_file_path, get_file_path_2 from torch.utils.dlpack import from_dlpack, to_dlpack from torch._utils import _rebuild_tensor +from torch._six import inf, nan from itertools import product, combinations from functools import reduce from torch import multiprocessing as mp from common import TestCase, iter_indices, TEST_NUMPY, TEST_SCIPY, TEST_MKL, \ - run_tests, download_file, skipIfNoLapack, suppress_warnings, IS_WINDOWS, \ - PY3, NO_MULTIPROCESSING_SPAWN, skipIfNoZeroSize + TEST_LIBROSA, run_tests, download_file, skipIfNoLapack, suppress_warnings, \ + IS_WINDOWS, PY3, NO_MULTIPROCESSING_SPAWN, skipIfNoZeroSize from multiprocessing.reduction import ForkingPickler if TEST_NUMPY: @@ -30,6 +31,9 @@ if TEST_SCIPY: from scipy import signal +if TEST_LIBROSA: + import librosa + SIZE = 100 can_retrieve_source = True @@ -84,6 +88,60 @@ def __exit__(self, *args): class TestTorch(TestCase): + def _check_sum_dim(tensors, dim): + for tensor in tensors: + expected = tensor.numpy().sum(dim) + actual = tensor.sum(dim) + self.assertEqual(expected.shape, actual.shape) + if actual.dtype == torch.float: + self.assertTrue(np.allclose(expected, actual.numpy(), rtol=1e-03, atol=1e-05)) + else: + self.assertTrue(np.allclose(expected, actual.numpy())) + + def _make_tensors(self, shape, val_range=(-100, 100), use_floating=True, use_integral=True): + float_types = [torch.double, + torch.float] + int_types = [torch.int64, + torch.int32, + torch.int16] + + def make_contiguous(shape, dtype): + if dtype in float_types: + val = torch.randn(shape, dtype=dtype) + val = val * ((val_range[1] - val_range[0]) / (math.pi * 2.0)) + val = val + ((val_range[1] - val_range[0]) / 2.0) + val = torch.clamp(val, min=val_range[0], max=val_range[1]) + return val + result = torch.zeros(shape, dtype=dtype) + result.apply_(lambda x: random.randint(val_range[0], val_range[1])) + return result + + def make_non_contiguous(shape, dtype): + contig = make_contiguous(shape, dtype) + non_contig = torch.empty(shape + (2, 2), dtype=dtype)[..., 0] + non_contig = non_contig.select(-1, -1) + non_contig.copy_(contig) + self.assertFalse(non_contig.is_contiguous()) + return non_contig + + def make_contiguous_slice(size, dtype): + contig = make_contiguous((1, size), dtype) + non_contig = contig[:1, 1:size - 1] + self.assertTrue(non_contig.is_contiguous()) + return contig + + types = [] + if use_floating: + types += float_types + if use_integral: + types += int_types + tensors = {"cont": [], "noncont": [], "slice": []} + for dtype in types: + tensors["cont"].append(make_contiguous(shape, dtype)) + tensors["noncont"].append(make_non_contiguous(shape, dtype)) + tensors["slice"].append(make_contiguous_slice(sum(list(shape)), dtype)) + + return tensors def test_dot(self): types = { @@ -238,17 +296,17 @@ def test_allclose(self): self.assertTrue(torch.allclose(x, y, rtol=0.01, atol=0.0)) self.assertFalse(torch.allclose(x, y)) self.assertTrue(torch.allclose(torch.tensor([0.0]), torch.tensor([1e-8]))) - x = 
torch.tensor([2.0, 3.0, float('nan')]) - y = torch.tensor([2.01, 3.01, float('nan')]) + x = torch.tensor([2.0, 3.0, nan]) + y = torch.tensor([2.01, 3.01, nan]) self.assertFalse(torch.allclose(x, y, rtol=1e-2)) self.assertTrue(torch.allclose(x, y, rtol=1e-2, equal_nan=True)) self.assertFalse(torch.allclose(x, y, rtol=1e-3, equal_nan=True)) - inf = torch.tensor([float('inf')]) - self.assertTrue(torch.allclose(inf, inf)) - self.assertTrue(torch.allclose(-inf, -inf)) - self.assertFalse(torch.allclose(inf, -inf)) - self.assertFalse(torch.allclose(inf, torch.tensor([1e20]))) - self.assertFalse(torch.allclose(-inf, torch.tensor([-1e20]))) + inf_t = torch.tensor([inf]) + self.assertTrue(torch.allclose(inf_t, inf_t)) + self.assertTrue(torch.allclose(-inf_t, -inf_t)) + self.assertFalse(torch.allclose(inf_t, -inf_t)) + self.assertFalse(torch.allclose(inf_t, torch.tensor([1e20]))) + self.assertFalse(torch.allclose(-inf_t, torch.tensor([-1e20]))) def test_linear_algebra_scalar_raises(self): m = torch.randn(5, 5) @@ -356,13 +414,13 @@ def sinh(x): try: return math.sinh(x) except OverflowError: - return float('inf') if x > 0 else float('-inf') + return inf if x > 0 else -inf self._test_math(torch.sinh, sinh) def test_lgamma(self): def lgamma(x): if x <= 0 and x == int(x): - return float('inf') + return inf return math.lgamma(x) self._test_math(torch.lgamma, lgamma) @@ -389,14 +447,14 @@ def test_digamma(self): # scipy 1.1.0 changed when it returns +/-inf vs. NaN def torch_digamma_without_inf(inp): res = torch.digamma(inp) - res[(res == float('-inf')) | (res == float('inf'))] = float('nan') + res[(res == -inf) | (res == inf)] = nan return res def scipy_digamma_without_inf(inp): res = digamma(inp) if np.isscalar(res): - return res if np.isfinite(res) else float('nan') - res[np.isinf(res)] = float('nan') + return res if np.isfinite(res) else nan + res[np.isinf(res)] = nan return res self._test_math(torch_digamma_without_inf, scipy_digamma_without_inf, self._digamma_input()) @@ -410,7 +468,7 @@ def test_polygamma(self): self._digamma_input(test_poles=False)) def test_asin(self): - self._test_math(torch.asin, lambda x: math.asin(x) if abs(x) <= 1 else float('nan')) + self._test_math(torch.asin, lambda x: math.asin(x) if abs(x) <= 1 else nan) def test_cos(self): self._test_math_by_name('cos') @@ -422,11 +480,11 @@ def cosh(x): except OverflowError: # Return inf on overflow. 
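These _test_math cases all follow one pattern: evaluate the torch op and a scalar Python reference that encodes the NaN/inf conventions, then compare elementwise. A minimal sketch of that pattern for asin (the helper name is illustrative, not taken from test_torch.py):

    import math
    import torch

    def asin_ref(x):
        # Out-of-domain inputs map to NaN, matching the torch convention.
        return math.asin(x) if abs(x) <= 1 else math.nan

    t = torch.tensor([-2.0, -0.5, 0.0, 0.5, 2.0], dtype=torch.double)
    expected = torch.tensor([asin_ref(v) for v in t.tolist()], dtype=torch.double)
    actual = torch.asin(t)
    # equal_nan=True lets the out-of-domain entries compare equal, mirroring
    # the NaN handling in the test harness.
    assert torch.allclose(actual, expected, equal_nan=True)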
# See http://en.cppreference.com/w/cpp/numeric/math/cosh - return float('inf') + return inf self._test_math(torch.cosh, cosh) def test_acos(self): - self._test_math(torch.acos, lambda x: math.acos(x) if abs(x) <= 1 else float('nan')) + self._test_math(torch.acos, lambda x: math.acos(x) if abs(x) <= 1 else nan) def test_tan(self): self._test_math_by_name('tan') @@ -440,36 +498,36 @@ def test_atan(self): def test_log(self): def log(x): if x == 0: - return float('-inf') + return -inf elif x < 0: - return float('nan') + return nan return math.log(x) self._test_math(torch.log, log) def test_log10(self): def log10(x): if x == 0: - return float('-inf') + return -inf elif x < 0: - return float('nan') + return nan return math.log10(x) self._test_math(torch.log10, log10) def test_log1p(self): def log1p(x): if x == -1: - return float('-inf') + return -inf elif x < -1: - return float('nan') + return nan return math.log1p(x) self._test_math(torch.log1p, log1p) def test_log2(self): def log2(x): if x == 0: - return float('-inf') + return -inf elif x < 0: - return float('nan') + return nan try: return math.log2(x) except AttributeError: @@ -477,7 +535,7 @@ def log2(x): self._test_math(torch.log2, log2) def test_sqrt(self): - self._test_math(torch.sqrt, lambda x: math.sqrt(x) if x >= 0 else float('nan')) + self._test_math(torch.sqrt, lambda x: math.sqrt(x) if x >= 0 else nan) def test_erf(self): self._test_math_by_name('erf') @@ -490,9 +548,9 @@ def checkType(tensor): inputValues = torch.randn(4, 4, out=tensor()).clamp(-2., 2.) self.assertEqual(tensor(inputValues).erf().erfinv(), tensor(inputValues)) # test inf - self.assertTrue(torch.equal(tensor([-1, 1]).erfinv(), tensor([float('-inf'), float('inf')]))) + self.assertTrue(torch.equal(tensor([-1, 1]).erfinv(), tensor([-inf, inf]))) # test nan - self.assertEqual(tensor([-2, 2]).erfinv(), tensor([float('nan'), float('nan')])) + self.assertEqual(tensor([-2, 2]).erfinv(), tensor([nan, nan])) checkType(torch.FloatTensor) checkType(torch.DoubleTensor) @@ -502,7 +560,7 @@ def exp(x): try: return math.exp(x) except OverflowError: - return float('inf') + return inf self._test_math(torch.exp, exp) def test_expm1(self): @@ -510,7 +568,7 @@ def expm1(x): try: return math.expm1(x) except OverflowError: - return float('inf') + return inf self._test_math(torch.expm1, expm1) def test_floor(self): @@ -522,9 +580,9 @@ def test_ceil(self): def test_rsqrt(self): def rsqrt(x): if x == 0: - return float('inf') + return inf elif x < 0: - return float('nan') + return nan return 1.0 / math.sqrt(x) self._test_math(torch.rsqrt, rsqrt) @@ -612,7 +670,7 @@ def _testSelection(self, torchfn, mathfn): # NaNs for index in (0, 4, 99): m1 = torch.randn(100) - m1[index] = float('nan') + m1[index] = nan res1val, res1ind = torch.max(m1, 0) self.assertTrue(math.isnan(res1val)) self.assertEqual(res1ind, index) @@ -630,14 +688,14 @@ def _test_norm(self, device): # full reduction x = torch.randn(5, device=device) xn = x.cpu().numpy() - for p in [0, 1, 2, 3, 4, float('inf')]: + for p in [0, 1, 2, 3, 4, inf]: res = x.norm(p).item() expected = np.linalg.norm(xn, p) self.assertEqual(res, expected, "full reduction failed for {}-norm".format(p)) # one dimension x = torch.randn(5, 5, device=device) xn = x.cpu().numpy() - for p in [0, 1, 2, 3, 4, float('inf')]: + for p in [0, 1, 2, 3, 4, inf]: res = x.norm(p, 1).cpu().numpy() expected = np.linalg.norm(xn, p, 1) self.assertEqual(res.shape, expected.shape) @@ -805,10 +863,10 @@ def test_reduction_empty(self): ('prod', lambda *args, **kwargs: 
torch.prod(*args, **kwargs), 1), ('sum', lambda *args, **kwargs: torch.sum(*args, **kwargs), 0), ('norm', lambda *args, **kwargs: torch.norm(*args, p=2, **kwargs), 0), - ('mean', lambda *args, **kwargs: torch.mean(*args, **kwargs), float('nan')), - ('var', lambda *args, **kwargs: torch.var(*args, **kwargs), float('nan')), - ('std', lambda *args, **kwargs: torch.std(*args, **kwargs), float('nan')), - ('logsumexp', lambda *args, **kwargs: torch.logsumexp(*args, **kwargs), float('-inf')), + ('mean', lambda *args, **kwargs: torch.mean(*args, **kwargs), nan), + ('var', lambda *args, **kwargs: torch.var(*args, **kwargs), nan), + ('std', lambda *args, **kwargs: torch.std(*args, **kwargs), nan), + ('logsumexp', lambda *args, **kwargs: torch.logsumexp(*args, **kwargs), -inf), ] devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] @@ -875,8 +933,8 @@ def test_pairwise_distance_empty(self): def test_logsumexp(self): from scipy.special import logsumexp a = torch.randn(5, 4) - a[0, 0] = float('inf') - a[1, :] = float('-inf') + a[0, 0] = inf + a[1, :] = -inf actual = a.logsumexp(1) expected = logsumexp(a.numpy(), 1) self.assertEqual(expected.shape, actual.shape) @@ -1064,18 +1122,18 @@ def test_csub(self): @staticmethod def _test_neg(self, cast): - float_types = ['torch.DoubleTensor', 'torch.FloatTensor', 'torch.LongTensor'] - int_types = ['torch.IntTensor', 'torch.ShortTensor', 'torch.ByteTensor', - 'torch.CharTensor'] + float_types = [torch.DoubleTensor, torch.FloatTensor, torch.LongTensor] + int_types = [torch.IntTensor, torch.ShortTensor, torch.ByteTensor, + torch.CharTensor] for t in float_types + int_types: if t in float_types: a = cast(torch.randn(100, 90).type(t)) else: - a = cast(torch.Tensor(100, 90).type(t).random_(-128, 128)) + a = cast(torch.randint(-128, 128, (100, 90), dtype=t.dtype)) zeros = cast(torch.Tensor().type(t)).resize_as_(a).zero_() - if t == 'torch.ByteTensor': + if t == torch.ByteTensor: res_add = torch.add(zeros, a, alpha=255) else: res_add = torch.add(zeros, a, alpha=-1) @@ -1537,7 +1595,7 @@ def test_cmul(self): self._test_cop(torch.mul, lambda x, y: x * y) def test_cpow(self): - self._test_cop(torch.pow, lambda x, y: float('nan') if x < 0 else math.pow(x, y)) + self._test_cop(torch.pow, lambda x, y: nan if x < 0 else math.pow(x, y)) @unittest.skipIf(not TEST_NUMPY, 'Numpy not found') def test_einsum(self): @@ -1615,65 +1673,32 @@ def check_sum_all(tensor): @unittest.skipIf(not TEST_NUMPY, 'Numpy not found') def test_sum_dim(self): - def check_sum_dim(tensors, dim): - for tensor in tensors: - expected = tensor.numpy().sum(dim) - actual = tensor.sum(dim) - self.assertEqual(expected.shape, actual.shape) - if actual.dtype == torch.float: - self.assertTrue(np.allclose(expected, actual.numpy(), rtol=1e-03, atol=1e-05)) - else: - self.assertTrue(np.allclose(expected, actual.numpy())) - - float_types = [torch.double, - torch.float] - int_types = [torch.int64, - torch.int32, - torch.int16] - - def make_contiguous(shape, dtype): - if dtype in float_types: - return torch.randn(*shape, dtype=dtype) - result = torch.zeros(*shape, dtype=dtype) - result.apply_(lambda x: random.randint(-100, 100)) - return result - - def make_non_contiguous(shape, dtype): - contig = make_contiguous(shape, dtype) - non_contig = torch.empty(shape + (2,), dtype=dtype)[..., 0] - non_contig.copy_(contig) - self.assertFalse(non_contig.is_contiguous()) - return non_contig - - def make_tensors(*shape): - tensors = [] - for dtype in float_types + int_types: - 
tensors.append(make_contiguous(shape, dtype)) - tensors.append(make_non_contiguous(shape, dtype)) - return tensors - - check_sum_dim(make_tensors(5, 400000), 1) - check_sum_dim(make_tensors(3, 5, 7), 0) - check_sum_dim(make_tensors(3, 5, 7), 1) - check_sum_dim(make_tensors(3, 5, 7), 2) - check_sum_dim(make_tensors(100000), -1) - check_sum_dim(make_tensors(50, 50, 50), 0) - check_sum_dim(make_tensors(50, 50, 50), 1) - check_sum_dim(make_tensors(50, 50, 50), 2) - check_sum_dim(make_tensors(50, 50, 50), (1, 2)) - check_sum_dim(make_tensors(50, 50, 50), (1, -1)) - - def make_contiguous_slice(size, dtype): - contig = make_contiguous((1, size), dtype) - non_contig = contig[:1, 1:size - 1] - self.assertTrue(non_contig.is_contiguous()) - return contig - - for dtype in float_types + int_types: - check_sum_dim(make_contiguous_slice(5, dtype), 0) - check_sum_dim(make_contiguous_slice(50, dtype), 0) - check_sum_dim(make_contiguous_slice(500, dtype), 0) - check_sum_dim(make_contiguous_slice(100000, dtype), 0) + def check_sum_dim(tensors_dict, dim): + for category, tensors in tensors_dict.items(): + if category == "slice": + dim = 0 + for tensor in tensors: + expected = tensor.numpy().sum(dim) + actual = tensor.sum(dim) + self.assertEqual(expected.shape, actual.shape) + if actual.dtype == torch.float: + self.assertTrue(np.allclose(expected, actual.numpy(), rtol=1e-03, atol=1e-05)) + else: + self.assertTrue(np.allclose(expected, actual.numpy())) + + float_types = [torch.double, torch.float] + int_types = [torch.int64, torch.int32, torch.int16] + + check_sum_dim(self._make_tensors((5, 400000)), 1) + check_sum_dim(self._make_tensors((3, 5, 7)), 0) + check_sum_dim(self._make_tensors((3, 5, 7)), 1) + check_sum_dim(self._make_tensors((3, 5, 7)), 2) + check_sum_dim(self._make_tensors((100000, )), -1) + check_sum_dim(self._make_tensors((50, 50, 50)), 0) + check_sum_dim(self._make_tensors((50, 50, 50)), 1) + check_sum_dim(self._make_tensors((50, 50, 50)), 2) + check_sum_dim(self._make_tensors((50, 50, 50)), (1, 2)) + check_sum_dim(self._make_tensors((50, 50, 50)), (1, -1)) def test_sum_out(self): x = torch.rand(100, 100) @@ -1906,6 +1931,12 @@ def test_device(self): self.assertRaises(TypeError, lambda: torch.device('other')) self.assertRaises(TypeError, lambda: torch.device('other:0')) + device_set = {'cpu', 'cpu:0', 'cuda', 'cuda:0', 'cuda:1', 'cuda:10', 'cuda:100'} + device_hash_set = set() + for device in list(device_set): + device_hash_set.add(hash(torch.device(device))) + self.assertEqual(len(device_set), len(device_hash_set)) + def test_tensor_device(self): def assertEqual(device_str, fn): self.assertEqual(torch.device(device_str), fn().device) @@ -2407,7 +2438,7 @@ def _test_renorm_ps(self, device): # full reduction x = torch.randn(5, 5) xn = x.numpy() - for p in [1, 2, 3, 4, float('inf')]: + for p in [1, 2, 3, 4, inf]: res = x.renorm(p, 1, 1) expected = x / x.norm(p, 0, keepdim=True).clamp(min=1) self.assertEqual(res.numpy(), expected.numpy(), "renorm failed for {}-norm".format(p)) @@ -2523,9 +2554,9 @@ def _test_multinomial_invalid_probs(probs): def test_multinomial_invalid_probs(self): test_method = TestTorch._test_multinomial_invalid_probs self._spawn_method(test_method, torch.Tensor([0, -1])) - self._spawn_method(test_method, torch.Tensor([0, float('inf')])) - self._spawn_method(test_method, torch.Tensor([0, float('-inf')])) - self._spawn_method(test_method, torch.Tensor([0, float('nan')])) + self._spawn_method(test_method, torch.Tensor([0, inf])) + self._spawn_method(test_method, torch.Tensor([0, 
-inf])) + self._spawn_method(test_method, torch.Tensor([0, nan])) @suppress_warnings def test_range(self): @@ -4455,106 +4486,61 @@ def test_fft_ifft_rfft_irfft(self): @staticmethod def _test_stft(self, device='cpu'): - def naive_stft(x, frame_length, hop, fft_size=None, normalized=False, - onesided=True, window=None, pad_end=0): - if fft_size is None: - fft_size = frame_length - x = x.clone() + if not TEST_LIBROSA: + raise unittest.SkipTest('librosa not found') + + def librosa_stft(x, n_fft, hop_length, win_length, window, center): if window is None: - window = x.new_ones(frame_length) + window = np.ones(n_fft if win_length is None else win_length) else: - window = window.clone() + window = window.cpu().numpy() input_1d = x.dim() == 1 if input_1d: x = x.view(1, -1) - batch = x.size(0) - if pad_end > 0: - x_pad = x.new(batch, pad_end).fill_(0) - x = torch.cat([x, x_pad], 1) - length = x.size(1) - if TEST_NUMPY and TEST_SCIPY: - sp_result = signal.stft( - x, - nperseg=frame_length, - noverlap=frame_length - hop, - window=window, - nfft=fft_size, - return_onesided=onesided, - boundary=None, - padded=False, - )[2].transpose((0, 2, 1)) * np.abs(window.sum().item()) - result = torch.Tensor(np.stack([sp_result.real, sp_result.imag], -1)) - else: - if onesided: - return_size = int(fft_size / 2) + 1 - else: - return_size = fft_size - result = x.new(batch, int((length - frame_length) / float(hop)) + 1, return_size, 2) - for w in range(return_size): # freq - radians = torch.arange(float(frame_length)) * w * 2 * math.pi / fft_size - radians = radians.type_as(x) - re_kernel = radians.cos().mul_(window) - im_kernel = -radians.sin().mul_(window) - for b in range(batch): - for i, t in enumerate(range(0, length - frame_length + 1, hop)): - seg = x[b, t:(t + frame_length)] - re = seg.dot(re_kernel) - im = seg.dot(im_kernel) - result[b, i, w, 0] = re - result[b, i, w, 1] = im - if normalized: - result /= frame_length ** 0.5 + result = [] + for xi in x: + ri = librosa.stft(xi.cpu().numpy(), n_fft, hop_length, win_length, window, center=center) + result.append(torch.from_numpy(np.stack([ri.real, ri.imag], -1))) + result = torch.stack(result, 0) if input_1d: result = result[0] return result - def _test(sizes, frame_length, hop, fft_size=None, normalized=False, - onesided=True, window_sizes=None, pad_end=0, expected_error=None): + def _test(sizes, n_fft, hop_length=None, win_length=None, win_sizes=None, + center=True, expected_error=None): x = torch.randn(*sizes, device=device) - if window_sizes is not None: - window = torch.randn(*window_sizes, device=device) + if win_sizes is not None: + window = torch.randn(*win_sizes, device=device) else: window = None if expected_error is None: - result = x.stft(frame_length, hop, fft_size, normalized, onesided, window, pad_end) - ref_result = naive_stft(x, frame_length, hop, fft_size, normalized, onesided, window, pad_end) - self.assertEqual(result.data, ref_result, 7e-6, 'stft result') + result = x.stft(n_fft, hop_length, win_length, window, center=center) + ref_result = librosa_stft(x, n_fft, hop_length, win_length, window, center) + self.assertEqual(result, ref_result, 7e-6, 'stft comparison against librosa') else: self.assertRaises(expected_error, - lambda: x.stft(frame_length, hop, fft_size, normalized, onesided, window, pad_end)) - - _test((2, 5), 4, 2, pad_end=1) - _test((4, 150), 90, 45, pad_end=0) - _test((10,), 7, 2, pad_end=0) - _test((10, 4000), 1024, 512, pad_end=0) + lambda: x.stft(n_fft, hop_length, win_length, window, center=center)) - _test((2, 5), 4, 2, 
window_sizes=(4,), pad_end=1) - _test((4, 150), 90, 45, window_sizes=(90,), pad_end=0) - _test((10,), 7, 2, window_sizes=(7,), pad_end=0) - _test((10, 4000), 1024, 512, window_sizes=(1024,), pad_end=0) + for center in [True, False]: + _test((10,), 7, center=center) + _test((10, 4000), 1024, center=center) - _test((2, 5), 4, 2, fft_size=5, window_sizes=(4,), pad_end=1) - _test((4, 150), 90, 45, fft_size=100, window_sizes=(90,), pad_end=0) - _test((10,), 7, 2, fft_size=33, window_sizes=(7,), pad_end=0) - _test((10, 4000), 1024, 512, fft_size=1500, window_sizes=(1024,), pad_end=0) + _test((10,), 7, 2, center=center) + _test((10, 4000), 1024, 512, center=center) - _test((2, 5), 4, 2, fft_size=5, onesided=False, window_sizes=(4,), pad_end=1) - _test((4, 150), 90, 45, fft_size=100, onesided=False, window_sizes=(90,), pad_end=0) - _test((10,), 7, 2, fft_size=33, onesided=False, window_sizes=(7,), pad_end=0) - _test((10, 4000), 1024, 512, fft_size=1500, onesided=False, window_sizes=(1024,), pad_end=0) + _test((10,), 7, 2, win_sizes=(7,), center=center) + _test((10, 4000), 1024, 512, win_sizes=(1024,), center=center) - _test((2, 5), 4, 2, fft_size=5, normalized=True, onesided=False, window_sizes=(4,), pad_end=1) - _test((4, 150), 90, 45, fft_size=100, normalized=True, onesided=False, window_sizes=(90,), pad_end=0) - _test((10,), 7, 2, fft_size=33, normalized=True, onesided=False, window_sizes=(7,), pad_end=0) - _test((10, 4000), 1024, 512, fft_size=1500, normalized=True, onesided=False, window_sizes=(1024,), pad_end=0) + # spectral oversample + _test((10,), 7, 2, win_length=5, center=center) + _test((10, 4000), 1024, 512, win_length=100, center=center) _test((10, 4, 2), 1, 1, expected_error=RuntimeError) - _test((10,), 11, 1, expected_error=RuntimeError) - _test((10,), 0, 1, pad_end=4, expected_error=RuntimeError) - _test((10,), 15, 1, pad_end=4, expected_error=RuntimeError) - _test((10,), 5, -4, expected_error=RuntimeError) - _test((10,), 5, 4, window_sizes=(11,), expected_error=RuntimeError) - _test((10,), 5, 4, window_sizes=(1, 1), expected_error=RuntimeError) + _test((10,), 11, 1, center=False, expected_error=RuntimeError) + _test((10,), -1, 1, expected_error=RuntimeError) + _test((10,), 3, win_length=5, expected_error=RuntimeError) + _test((10,), 5, 4, win_sizes=(11,), expected_error=RuntimeError) + _test((10,), 5, 4, win_sizes=(1, 1), expected_error=RuntimeError) def test_stft(self): self._test_stft(self) @@ -4707,14 +4693,18 @@ def test_logical(self): self.assertEqual(neqs.long().sum(), xne.long().sum(), 0) self.assertEqual(x.nelement(), all.long().sum()) - def test_isnan(self): - x = torch.Tensor([1, float('nan'), 2]) - self.assertEqual(torch.isnan(x), torch.ByteTensor([0, 1, 0])) + def test_isfinite(self): + x = torch.Tensor([1, inf, 2, -inf, nan, -10]) + self.assertEqual(torch.isfinite(x), torch.ByteTensor([1, 0, 1, 0, 0, 1])) def test_isinf(self): - x = torch.Tensor([1, float('inf'), 2, float('-inf'), float('nan')]) + x = torch.Tensor([1, inf, 2, -inf, nan]) self.assertEqual(torch.isinf(x), torch.ByteTensor([0, 1, 0, 1, 0])) + def test_isnan(self): + x = torch.Tensor([1, nan, 2]) + self.assertEqual(torch.isnan(x), torch.ByteTensor([0, 1, 0])) + def test_RNGState(self): state = torch.get_rng_state() stateCloned = state.clone() @@ -5871,26 +5861,31 @@ def test_masked_fill(self): self.assertEqual(dst, dst2, 0) def test_abs(self): - size = 1000 - max_val = 1000 - original = torch.rand(size).mul(max_val) - # Tensor filled with values from {-1, 1} - switch = 
torch.rand(size).mul(2).floor().mul(2).add(-1) + def _test_abs(tensors_dict): + for category, tensors in tensors_dict.items(): + for data in tensors: + switch = torch.rand(data.size()).mul(2).floor().mul(2).add(-1).type(data.dtype) + res = torch.mul(data, switch) + self.assertTensorsSlowEqual(res.abs(), data, 1e-16) - types = ['torch.DoubleTensor', 'torch.FloatTensor', 'torch.LongTensor', - 'torch.IntTensor', 'torch.ShortTensor'] - for t in types: - data = original.type(t) - switch = switch.type(t) - res = torch.mul(data, switch) - # abs is used in assertEqual so we use the slow version instead - self.assertTensorsSlowEqual(res.abs(), data, 1e-16) + max_val = 1000 + _test_abs(self._make_tensors((3, 4), val_range=(0, max_val))) + _test_abs(self._make_tensors((3, 5, 7), val_range=(0, max_val))) + _test_abs(self._make_tensors((2, 2, 5, 8, 2, 3), val_range=(0, max_val))) + _test_abs(self._make_tensors((1000, ), val_range=(0, max_val))) + _test_abs(self._make_tensors((30, 30, 30), val_range=(0, max_val))) # Checking that the right abs function is called for LongTensor bignumber = 2 ^ 31 + 1 res = torch.LongTensor((-bignumber,)) self.assertGreater(res.abs()[0], 0) + # One of + rec = torch.randn(2, 2, 3, 7, 6, 2).type(torch.float64).clamp(0, 1) + val1 = rec.select(-1, -1).data[0][0][0].sum() + val2 = rec.select(-1, -1).data.abs()[0][0][0].sum() + self.assertEqual(val1, val2, 1e-8, 'absolute value') + def test_hardshrink(self): data_original = torch.tensor([1, 0.5, 0.3, 0.6]).view(2, 2) float_types = [ @@ -6031,6 +6026,11 @@ def test_reshape(self): self.assertEqual(empty.reshape([1, -1]).shape, (0,)) self.assertRaises(RuntimeError, lambda: empty.reshape(1)) + x = torch.randn(3, 3) + self.assertEqual(x.data_ptr(), x.reshape_as(torch.rand(9)).data_ptr()) + self.assertEqual(x.data_ptr(), x.reshape_as(torch.rand(1, 9, 1)).data_ptr()) + self.assertRaises(RuntimeError, lambda: x.reshape_as(torch.rand(10))) + @skipIfNoZeroSize def test_empty_reshape(self): x = torch.randn(0, 6) @@ -6106,6 +6106,255 @@ def test_tensor_shape_empty(self): self.assertEqual([(0, 1, 3, 0)], [z.shape for z in torch.split(x, 1, dim=0)]) self.assertEqual([(0, 1, 3, 0)], [z.shape for z in torch.split(x, 0, dim=0)]) + # functions that operate over a dimension but don't reduce. + @skipIfNoZeroSize + def test_dim_function_empty(self): + # FIXME: enable CUDA tests. 
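A quick aside on the reshape_as assertions added just above: reshape only copies when the requested shape cannot be expressed as a view of the existing storage, so comparing data_ptr() is a cheap way to assert that no copy happened. A minimal sketch of the same check outside the test harness (the tensors here are made up for illustration, not taken from the test):

    import torch

    x = torch.randn(3, 3)                # contiguous, so reshape can alias the storage
    y = x.reshape_as(torch.rand(9))      # same 9 elements, new shape, no copy
    assert x.data_ptr() == y.data_ptr()

    z = x.t().reshape(9)                 # non-contiguous source: reshape has to copy
    assert x.data_ptr() != z.data_ptr()
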
+ devices = ['cpu'] # if not torch.cuda.is_available() else ['cpu', 'cuda'] + for device in devices: + shape = (0, 1, 2, 0) + x = torch.randn(shape, device=device) + + # size stride + self.assertEqual(0, x.size(3)) + self.assertEqual(2, x.size(2)) + self.assertEqual(2, x.stride(0)) + self.assertEqual(1, x.stride(2)) + + self.assertEqual(x, torch.nn.functional.glu(x, 0)) + self.assertEqual((0, 1, 1, 0), torch.nn.functional.glu(x, 2).shape) + + # softmax, logsoftmax + self.assertEqual(x, torch.nn.functional.softmax(x, 0)) + self.assertEqual(x, torch.nn.functional.softmax(x, 2)) + + self.assertEqual(x, torch.nn.functional.log_softmax(x, 0)) + self.assertEqual(x, torch.nn.functional.log_softmax(x, 2)) + + # cumsum, cumprod + self.assertEqual(shape, torch.cumsum(x, 0).shape) + self.assertEqual(shape, torch.cumsum(x, 2).shape) + self.assertEqual(shape, torch.cumprod(x, 0).shape) + self.assertEqual(shape, torch.cumprod(x, 2).shape) + + # flip + self.assertEqual(x, x.flip(0)) + self.assertEqual(x, x.flip(2)) + + # unbind + self.assertEqual((), x.unbind(0)) + self.assertEqual((torch.empty((0, 1, 0), device=device), torch.empty((0, 1, 0), device=device)), + x.unbind(2)) + + # cross + y = torch.randn((0, 1, 3, 0), device=device) + self.assertEqual(y.shape, torch.cross(y, y).shape) + + # renorm + self.assertEqual(shape, torch.renorm(x, 1, 0, 5).shape) + self.assertEqual(shape, torch.renorm(x, 1, 2, 5).shape) + + # sort + self.assertEqual([shape, shape], [z.shape for z in torch.sort(x, dim=0)]) + self.assertEqual([shape, shape], [z.shape for z in torch.sort(x, dim=2)]) + + # topk + self.assertEqual([shape, shape], [z.shape for z in torch.topk(x, 0, dim=0)]) + self.assertEqual([(0, 1, 1, 0), (0, 1, 1, 0)], [z.shape for z in torch.topk(x, 1, dim=2)]) + + y = torch.randn((2, 3, 4), device=device) + self.assertEqual([(2, 3, 0), (2, 3, 0)], [z.shape for z in torch.topk(y, 0)]) + + # gather + self.assertEqual(shape, torch.gather(x, 0, torch.empty(shape, dtype=torch.int64)).shape) + self.assertEqual(shape, torch.gather(x, 2, torch.empty(shape, dtype=torch.int64)).shape) + larger_shape = (0, 1, 3, 0) + self.assertEqual(larger_shape, torch.gather(x, 2, torch.empty(larger_shape, dtype=torch.int64)).shape) + smaller_shape = (0, 1, 0, 0) + self.assertEqual(smaller_shape, torch.gather(x, 2, torch.empty(smaller_shape, dtype=torch.int64)).shape) + y = torch.randn((2, 3, 4), device=device) + self.assertEqual((0, 3, 4), torch.gather(y, 0, torch.empty((0, 3, 4), dtype=torch.int64)).shape) + + # scatter, scatter_add + for dim in [0, 2]: + y = torch.randn(shape, device=device) + y_src = torch.randn(shape, device=device) + self.assertEqual(shape, y.scatter_(dim, torch.empty(shape, dtype=torch.int64), y_src).shape) + self.assertEqual(shape, y.scatter_add_(dim, torch.empty(shape, dtype=torch.int64), y_src).shape) + + z = torch.randn((2, 3, 4), device=device) + z_src = torch.randn((2, 3, 4), device=device) + self.assertEqual(z, z.scatter_(2, torch.empty((2, 3, 0), dtype=torch.int64), z_src)) + self.assertEqual(z, z.scatter_add_(2, torch.empty((2, 3, 0), dtype=torch.int64), z_src)) + + # index_fill, index_copy, index_add + c = x.clone() + ind_empty = torch.tensor([], dtype=torch.int64) + ind_01 = torch.tensor([0, 1], dtype=torch.int64) + self.assertEqual(c, c.index_fill_(0, ind_empty, -1)) + self.assertEqual(c, c.index_fill_(2, ind_empty, -1)) + self.assertEqual(c, c.index_fill_(2, torch.tensor([0, 1], dtype=torch.int64), -1)) + self.assertEqual(c, c.index_copy_(0, ind_empty, torch.empty((0, 1, 2, 0), device=device))) + 
self.assertEqual(c, c.index_copy_(2, ind_empty, torch.empty((0, 1, 0, 0), device=device))) + self.assertEqual(c, c.index_copy_(2, ind_01, torch.empty((0, 1, 2, 0), device=device))) + self.assertEqual(c, c.index_add_(0, ind_empty, torch.empty((0, 1, 2, 0), device=device))) + self.assertEqual(c, c.index_add_(2, ind_empty, torch.empty((0, 1, 0, 0), device=device))) + self.assertEqual(c, c.index_add_(2, ind_01, torch.empty((0, 1, 2, 0), device=device))) + + # index fill/copy/add non-empty + z = torch.randn((2, 3, 4), device=device) + self.assertEqual(z, z.index_fill_(0, ind_empty, -1)) + z = torch.randn((2, 3, 4), device=device) + self.assertEqual(z, z.index_copy_(0, ind_empty, torch.empty((0, 3, 4), device=device))) + z = torch.randn((2, 3, 4), device=device) + self.assertEqual(z, z.index_add_(0, ind_empty, torch.empty((0, 3, 4), device=device))) + + # index_select + self.assertEqual(x, x.index_select(0, ind_empty)) + self.assertEqual((0, 1, 0, 0), x.index_select(2, ind_empty).shape) + self.assertEqual(x, x.index_select(2, ind_01)) + z = torch.randn((2, 3, 4), device=device) # non-empty + self.assertEqual((0, 3, 4), z.index_select(0, ind_empty).shape) + + @skipIfNoZeroSize + def test_blas_empty(self): + devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] + for device in devices: + + def fn(torchfn, *args): + return torchfn(*tuple(torch.randn(shape, device=device) if isinstance(shape, tuple) else shape + for shape in args)) + + # mm, addmm + self.assertEqual((0, 0), fn(torch.mm, (0, 0), (0, 0)).shape) + self.assertEqual((0, 5), fn(torch.mm, (0, 0), (0, 5)).shape) + self.assertEqual((5, 0), fn(torch.mm, (5, 0), (0, 0)).shape) + self.assertEqual((3, 0), fn(torch.mm, (3, 2), (2, 0)).shape) + self.assertEqual(torch.zeros((5, 6), device=device), fn(torch.mm, (5, 0), (0, 6))) + + self.assertEqual((0, 0), fn(torch.addmm, (0, 0), (0, 0), (0, 0)).shape) + self.assertEqual((5, 6), fn(torch.addmm, (5, 6), (5, 0), (0, 6)).shape) + + # mv, addmv + self.assertEqual((0,), fn(torch.mv, (0, 0), (0,)).shape) + self.assertEqual((0,), fn(torch.mv, (0, 2), (2,)).shape) + self.assertEqual(torch.zeros((3,), device=device), fn(torch.mv, (3, 0), (0,))) + + self.assertEqual((0,), fn(torch.addmv, (0,), (0, 0), (0,)).shape) + self.assertEqual((3,), fn(torch.addmv, (3,), (3, 0), (0,)).shape) + + # ger, addr + self.assertEqual((0, 0), fn(torch.ger, (0,), (0,)).shape) + self.assertEqual((5, 0), fn(torch.ger, (5,), (0,)).shape) + self.assertEqual((0, 4), fn(torch.ger, (0,), (4,)).shape) + + self.assertEqual((0, 0), fn(torch.addr, (0, 0), (0,), (0,)).shape) + self.assertEqual((5, 0), fn(torch.addr, (5, 0), (5,), (0,)).shape) + self.assertEqual((0, 4), fn(torch.addr, (0, 4), (0,), (4,)).shape) + + # bmm, baddbmm + self.assertEqual((0, 0, 0), fn(torch.bmm, (0, 0, 0), (0, 0, 0)).shape) + self.assertEqual((3, 0, 5), fn(torch.bmm, (3, 0, 0), (3, 0, 5)).shape) + self.assertEqual((0, 5, 6), fn(torch.bmm, (0, 5, 0), (0, 0, 6)).shape) + self.assertEqual(torch.zeros((3, 5, 6), device=device), fn(torch.bmm, (3, 5, 0), (3, 0, 6))) + + self.assertEqual((0, 0, 0), fn(torch.baddbmm, (0, 0, 0), (0, 0, 0), (0, 0, 0)).shape) + self.assertEqual((3, 0, 5), fn(torch.baddbmm, (3, 0, 5), (3, 0, 0), (3, 0, 5)).shape) + self.assertEqual((0, 5, 6), fn(torch.baddbmm, (0, 5, 6), (0, 5, 0), (0, 0, 6)).shape) + self.assertEqual((3, 5, 6), fn(torch.baddbmm, (3, 5, 6), (3, 5, 0), (3, 0, 6)).shape) + + # addbmm + self.assertEqual((0, 0), fn(torch.addbmm, (0, 0), (0, 0, 0), (0, 0, 0)).shape) + self.assertEqual((0, 5), fn(torch.addbmm, (0, 
5), (3, 0, 0), (3, 0, 5)).shape) + self.assertEqual((5, 6), fn(torch.addbmm, (5, 6), (0, 5, 0), (0, 0, 6)).shape) + + # matmul + self.assertEqual(torch.tensor(0., device=device), fn(torch.matmul, (0,), (0,))) + self.assertEqual((0, 0), fn(torch.matmul, (0, 0), (0, 0)).shape) + self.assertEqual((0, 0, 0), fn(torch.matmul, (0, 0, 0), (0, 0, 0)).shape) + self.assertEqual((5, 0, 0), fn(torch.matmul, (5, 0, 0), (5, 0, 0)).shape) + self.assertEqual(torch.zeros((5, 3, 4), device=device), fn(torch.matmul, (5, 3, 0), (5, 0, 4))) + + # dot + self.assertEqual(torch.tensor(0., device=device), fn(torch.dot, (0,), (0,))) + + # btrifact + A_LU, pivots = fn(torch.btrifact, (0, 5, 5)) + self.assertEqual([(0, 5, 5), (0, 5)], [A_LU.shape, pivots.shape]) + A_LU, pivots = fn(torch.btrifact, (0, 0, 0)) + self.assertEqual([(0, 0, 0), (0, 0)], [A_LU.shape, pivots.shape]) + A_LU, pivots = fn(torch.btrifact, (2, 0, 0)) + self.assertEqual([(2, 0, 0), (2, 0)], [A_LU.shape, pivots.shape]) + + @skipIfNoZeroSize + def test_blas_alpha_beta_empty(self): + devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] + for device in ['cuda']: + # ensure beta is respected + value = 11 + input = torch.full((2,), value, device=device) + mat = torch.ones((2, 0), device=device) + vec = torch.ones((0,), device=device) + out = torch.randn((2,), device=device) + alpha = 6 + beta = 3 + self.assertEqual(torch.full((2,), beta * value, device=device), + torch.addmv(input=input, mat=mat, vec=vec, alpha=alpha, beta=beta)) + self.assertEqual(torch.full((2,), beta * value, device=device), + torch.addmv(input=input, mat=mat, vec=vec, alpha=alpha, beta=beta, out=out)) + + # torch.addmm + input = torch.full((2, 3), value, device=device) + mat2 = torch.ones((0, 3), device=device) + out = torch.randn((2, 3), device=device) + self.assertEqual(torch.full((2, 3), beta * value, device=device), + torch.addmm(input=input, mat1=mat, mat2=mat2, alpha=alpha, beta=beta)) + self.assertEqual(torch.full((2, 3), beta * value, device=device), + torch.addmm(input=input, mat1=mat, mat2=mat2, alpha=alpha, beta=beta, out=out)) + + @skipIfNoZeroSize + @skipIfNoLapack + def test_lapack_empty(self): + # FIXME: these are just a selection of LAPACK functions -- we need a general strategy here. + # The LAPACK functions themselves generally do NOT work with zero sized dimensions, although + # numpy/sci often has a direct wrapper (e.g. lu_factor) and a wrapper that "does the right thing" + # (e.g. lu). We often name our functions identically to the lapack function, so it will take work + # to name / migrate-to better wrappers. + + # FIXME: enable CUDA tests. 
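The alpha/beta test above leans on the identity addmv(input, mat, vec, beta=b, alpha=a) == b * input + a * (mat @ vec); when the reduction dimension has size zero the product term is all zeros, so the result must collapse to beta * input rather than to zeros. A rough sketch of that expectation, assuming zero-sized dimensions are enabled as in this patch (values are illustrative):

    import torch

    inp = torch.full((2,), 11.)
    mat = torch.ones(2, 0)    # empty reduction dimension
    vec = torch.ones(0)
    out = torch.addmv(inp, mat, vec, beta=3, alpha=6)
    assert torch.allclose(out, 3 * inp)   # mat @ vec contributes nothing
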
+ devices = ['cpu'] # if not torch.cuda.is_available() else ['cpu', 'cuda'] + for device in devices: + + def fn(torchfn, *args): + return torchfn(*tuple(torch.randn(shape, device=device) if isinstance(shape, tuple) else shape + for shape in args)) + + # inverse, pinverse + self.assertEqual((0, 0), fn(torch.inverse, (0, 0)).shape) + self.assertEqual((5, 0), fn(torch.pinverse, (0, 5)).shape) + self.assertEqual((0, 5), fn(torch.pinverse, (5, 0)).shape) + self.assertEqual((0, 0), fn(torch.pinverse, (0, 0)).shape) + + # svd + self.assertRaises(RuntimeError, lambda: fn(torch.svd, (0, 0))) + + # det, logdet, slogdet + self.assertEqual(torch.tensor(1., device=device), fn(torch.det, (0, 0))) + self.assertEqual(torch.tensor(0., device=device), fn(torch.logdet, (0, 0))) + self.assertEqual((torch.tensor(1., device=device), torch.tensor(0., device=device)), + fn(torch.slogdet, (0, 0))) + + # eig, symeig + evalues, evectors = fn(torch.eig, (0, 0), True) + self.assertEqual([(0, 2), (0, 0)], [evalues.shape, evectors.shape]) + evalues, evectors = fn(torch.symeig, (0, 0), True) + self.assertEqual([(0,), (0, 0)], [evalues.shape, evectors.shape]) + + # qr, gels + self.assertRaises(RuntimeError, lambda: torch.qr(torch.randn(0, 0))) + self.assertRaises(RuntimeError, lambda: torch.gels(torch.randn(0, 0), torch.randn(0, 0))) + self.assertRaises(RuntimeError, lambda: torch.gels(torch.randn(0,), torch.randn(0, 0))) + def test_expand(self): tensor = torch.rand(1, 8, 1) tensor2 = torch.rand(5) @@ -7221,7 +7470,7 @@ def test_print(self): self.assertExpected(str(x), subname='negint') # test inf and nan - x = torch.tensor([4, float('inf'), 1.5, float('-inf'), 0, float('nan'), 1]) + x = torch.tensor([4, inf, 1.5, -inf, 0, nan, 1]) self.assertEqual(x.__repr__(), str(x)) self.assertExpected(str(x), subname='nonfinite') diff --git a/third_party/onnx b/third_party/onnx index b2817a682f25f9..0efd9f85c4e837 160000 --- a/third_party/onnx +++ b/third_party/onnx @@ -1 +1 @@ -Subproject commit b2817a682f25f960586f06caa539bbbd7a96b859 +Subproject commit 0efd9f85c4e837e8d64a8ea4d2d5b7a59fab75bb diff --git a/tools/amd_build/build_caffe2_amd.py b/tools/amd_build/build_caffe2_amd.py new file mode 100755 index 00000000000000..9effd464bbdb38 --- /dev/null +++ b/tools/amd_build/build_caffe2_amd.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python + +import os +import sys +import subprocess + +amd_build_dir = os.path.dirname(os.path.realpath(__file__)) +proj_dir = os.path.join(os.path.dirname(os.path.dirname(amd_build_dir))) + +includes = [ + "caffe2/operators/*", + "caffe2/sgd/*", + "caffe2/image/*", + "caffe2/transforms/*", + "caffe2/video/*", + "caffe2/distributed/*", +] + +ignores = [ + "caffe2/operators/depthwise_3x3_conv_op.cu", + "caffe2/operators/depthwise_3x3_conv_op_cudnn.cu", + "caffe2/operators/top_k.cu", + "caffe2/operators/top_k_radix_selection.cuh", + "caffe2/operators/top_k_heap_selection.cuh", + "caffe2/operators/pool_op_cudnn.cu", + "caffe2/operators/roi_align_op_gpu_test.cc", + # elementwise ops test is failing + "caffe2/operators/elementwise_op_gpu_test.cc", + '**/hip/**', +] + +file_extensions = ['.cc', '.cu', '.h', '.cuh'] + +# Execute the Hipify Script. 
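Before the invocation that follows: the --includes / --ignores lists assembled above are plain fnmatch globs matched against paths relative to the project root (see matched_files_iter later in this patch), with ignores taking precedence over includes. A rough sketch of the selection rule, using a hypothetical operator file for illustration (not the script's own code):

    import fnmatch

    def selected(rel_path, includes, ignores):
        def matches(patterns):
            return any(fnmatch.fnmatch(rel_path, p) for p in patterns)
        return matches(includes) and not matches(ignores)

    includes = ["caffe2/operators/*", "caffe2/sgd/*"]
    ignores = ["caffe2/operators/top_k.cu", "**/hip/**"]
    assert selected("caffe2/operators/relu_op.cu", includes, ignores)
    assert not selected("caffe2/operators/top_k.cu", includes, ignores)
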
+args = [ + "--project-directory", proj_dir, + "--output-directory", proj_dir, + "--includes"] + includes + \ + ["--extensions"] + file_extensions + \ + ["--ignores"] + ignores + \ + ["--hipify_caffe2", "True"] + \ + ["--add-static-casts", "True"] + +subprocess.check_call([ + sys.executable, + os.path.join(amd_build_dir, "pyHIPIFY", "hipify-python.py"), +] + args) diff --git a/tools/amd_build/build_pytorch_amd.py b/tools/amd_build/build_pytorch_amd.py index 3d3ff2958a5ecf..ed7206f0bf6d27 100644 --- a/tools/amd_build/build_pytorch_amd.py +++ b/tools/amd_build/build_pytorch_amd.py @@ -8,9 +8,9 @@ amd_build_dir = os.path.dirname(os.path.realpath(__file__)) proj_dir = os.path.dirname(os.path.dirname(amd_build_dir)) -include_dirs = [ - "aten", - "torch" +includes = [ + "aten/*", + "torch/*" ] # List of operators currently disabled @@ -63,9 +63,12 @@ # Execute the Hipify Script. args = (["--project-directory", proj_dir] + ["--output-directory", proj_dir] + - ["--include-dirs"] + include_dirs + + ["--includes"] + includes + ["--yaml-settings", yaml_file] + ["--add-static-casts", "True"] + ["--show-progress", "False"]) -os.execv(os.path.join(amd_build_dir, "pyHIPIFY", "hipify-python.py"), ['python'] + args) +subprocess.check_call([ + sys.executable, + os.path.join(amd_build_dir, "pyHIPIFY", "hipify-python.py") +] + args) diff --git a/tools/amd_build/pyHIPIFY/constants.py b/tools/amd_build/pyHIPIFY/constants.py index 1ea8f81a9ca9f9..092de16cff7a90 100644 --- a/tools/amd_build/pyHIPIFY/constants.py +++ b/tools/amd_build/pyHIPIFY/constants.py @@ -53,3 +53,4 @@ HIP_UNSUPPORTED = 43 API_PYTORCH = 1337 +API_CAFFE2 = 1338 \ No newline at end of file diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py index c03562861692a1..26322a5842c09c 100644 --- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py +++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py @@ -2106,5 +2106,29 @@ "define MAX_NUM_BLOCKS 200": ("define MAX_NUM_BLOCKS 64", API_PYTORCH), } +CAFFE2_SPECIFIC_MAPPINGS = { + "CUDA" :("HIP", API_CAFFE2), + "REGISTER_CUDA_OPERATOR" : ("REGISTER_HIP_OPERATOR", API_CAFFE2), + "cuda_stream" : ("hip_stream", API_CAFFE2), + "context_gpu" : ("hip/context_hip", API_CAFFE2), + "common_gpu" : ("hip/common_hip", API_CAFFE2), + "mixed_utils" : ("hip/mixed_utils_hip", API_CAFFE2), + "operator_fallback_gpu" : ("hip/operator_fallback_hip", API_CAFFE2), + "recurrent_network_executor_gpu" : ("hip/recurrent_network_executor_hip", API_CAFFE2), + "max_pool_with_index_gpu": ("hip/max_pool_with_index_hip", API_CAFFE2), + "CUDA_1D_KERNEL_LOOP" : ("HIP_1D_KERNEL_LOOP", API_CAFFE2), + "CUDAContext" : ("HIPContext", API_CAFFE2), + "CAFFE_CUDA_NUM_THREADS" : ("CAFFE_HIP_NUM_THREADS", API_CAFFE2), + "HasCudaGPU" : ("HasHipGPU", API_CAFFE2), + "__expf" : ("expf", API_CAFFE2), + "CUBLAS_ENFORCE" : ("ROCBLAS_ENFORCE", API_CAFFE2), + "cublas_handle" : ("rocblas_handle", API_CAFFE2), + "CURAND_ENFORCE" :("HIPRAND_ENFORCE", API_CAFFE2), + "curandGenerateUniform" : ("hiprandGenerateUniform", API_CAFFE2), + "curand_generator" : ("hiprand_generator", API_CAFFE2), + "set_cuda_gpu_id" : ("set_hip_gpu_id", API_CAFFE2), + "CaffeCudaGetDevice" : ("CaffeHipGetDevice", API_CAFFE2), +} + CUDA_TO_HIP_MAPPINGS = [CUDA_TYPE_NAME_MAP, CUDA_IDENTIFIER_MAP, - CUDA_INCLUDE_MAP, CUDA_SPARSE_MAP, PYTORCH_SPECIFIC_MAPPINGS] + CUDA_INCLUDE_MAP, CUDA_SPARSE_MAP, PYTORCH_SPECIFIC_MAPPINGS, CAFFE2_SPECIFIC_MAPPINGS] diff --git a/tools/amd_build/pyHIPIFY/hipify-python.py 
b/tools/amd_build/pyHIPIFY/hipify-python.py index 15a717c7766cdf..fc3efabd26db7f 100755 --- a/tools/amd_build/pyHIPIFY/hipify-python.py +++ b/tools/amd_build/pyHIPIFY/hipify-python.py @@ -26,11 +26,13 @@ import argparse import constants +import fnmatch import re import shutil import sys import os import yaml +import ast from functools import reduce from enum import Enum @@ -40,6 +42,7 @@ """This dictionary provides the mapping from PyTorch kernel template types to their actual types.""" PYTORCH_TEMPLATE_MAP = {"Dtype": "real", "T": "real"} +CAFFE2_TEMPLATE_MAP = {} def openf(filename, mode): @@ -210,72 +213,47 @@ def update_progress_bar(total, progress): sys.stderr.flush() -def filename_ends_with_extension(filename, extensions): - """Helper method to see if filename ends with certain extension""" - for ext in extensions: - if filename.endswith("." + ext): - return True +def matched_files_iter(root_path, includes=('*',), ignores=(), extensions=(), hipify_caffe2=False): + def _fnmatch(filepath, patterns): + return any(fnmatch.fnmatch(filepath, pattern) for pattern in patterns) - return False + def match_extensions(filename): + """Helper method to see if filename ends with certain extension""" + return os.path.splitext(filename)[1] in extensions + for (dirpath, _, filenames) in os.walk(root_path, topdown=True): + for fn in filenames: + filepath = os.path.join(dirpath, fn) + rel_filepath = os.path.relpath(filepath, root_path) + if _fnmatch(rel_filepath, includes) and (not _fnmatch(rel_filepath, ignores)) and match_extensions(fn): + if hipify_caffe2 and not is_caffe2_gpu_file(filepath): + continue -def inside_included_directories(dirpath, rootpath, include_dirs): - """Helper method to see if filename within included directories""" - for included_directory in include_dirs: - if re.match(r'{0}\b'.format(os.path.join(rootpath, included_directory)), dirpath): - return True + yield filepath - return False - -def walk_over_directory(rootpath, extensions, show_detailed=False, include_dirs=None, show_progress=True): +def preprocess(all_files, show_detailed=False, show_progress=True, hipify_caffe2=False): """ - Recursively walk over the directory and call preprocessor on selected files. + Call preprocessor on selected files. Arguments) - extensions - A plist of file extensions ['cu', 'cuh', ..] - - include_dirs - Directories under the rootpath that should be included in the walk. - show_detailed - Show a detailed summary of the transpilation process. """ - # Default argument for excluded directories. - if include_dirs is None: - include_dirs = [] - # Compute the total number of files to be traversed. - total_files = 0 - for (dirpath, _dirnames, filenames) in os.walk(rootpath): - if inside_included_directories(dirpath, rootpath, include_dirs): - for filename in filenames: - total_files += filename_ends_with_extension(filename, extensions) - - current_file = 0 + total_count = len(all_files) + finished_count = 0 # Preprocessing statistics. stats = {"unsupported_calls": [], "kernel_launches": []} - # Begin traversing the files. - for (dirpath, _dirnames, filenames) in os.walk(rootpath, topdown=True): - # Check if file ends with a valid extensions - if not inside_included_directories(dirpath, rootpath, include_dirs): - continue - - for filename in filenames: - if filename_ends_with_extension(filename, extensions): - # Construct the file's full path - filepath = os.sep.join([dirpath, filename]) - - # Execute the preprocessor on the specified file. 
- preprocessor(filepath, stats) - - # Update the progress - if show_progress: - print(os.path.join(dirpath, filename)) - update_progress_bar(total_files, current_file) - - current_file += 1 + for filepath in all_files: + preprocessor(filepath, stats, hipify_caffe2) + # Update the progress + if show_progress: + print(filepath) + update_progress_bar(total_count, finished_count) + finished_count += 1 print(bcolors.OKGREEN + "Successfully preprocessed all matching files." + bcolors.ENDC) @@ -297,6 +275,41 @@ def compute_stats(stats): print("\nTotal number of replaced kernel launches: {0:d}".format(len(stats["kernel_launches"]))) +def add_dim3(kernel_string, cuda_kernel): + '''adds dim3() to the second and third arguments in the kernel launch''' + count = 0 + closure = 0 + kernel_string = kernel_string.replace("<<<", "").replace(">>>", "") + arg_locs = [{} for _ in range(2)] + arg_locs[count]['start'] = 0 + for ind, c in enumerate(kernel_string): + if count > 1: + break + if c == "(": + closure += 1 + elif c == ")": + closure -= 1 + elif (c == "," or ind == len(kernel_string) - 1) and closure == 0: + arg_locs[count]['end'] = ind + count += 1 + if count < 2: + arg_locs[count]['start'] = ind + 1 + + first_arg_raw = kernel_string[arg_locs[0]['start']:arg_locs[0]['end'] + 1] + second_arg_raw = kernel_string[arg_locs[1]['start']:arg_locs[1]['end']] + + first_arg_clean = kernel_string[arg_locs[0]['start']:arg_locs[0]['end']].replace("\n", "").strip(" ") + second_arg_clean = kernel_string[arg_locs[1]['start']:arg_locs[1]['end']].replace("\n", "").strip(" ") + + first_arg_dim3 = "dim3({})".format(first_arg_clean) + second_arg_dim3 = "dim3({})".format(second_arg_clean) + + first_arg_raw_dim3 = first_arg_raw.replace(first_arg_clean, first_arg_dim3) + second_arg_raw_dim3 = second_arg_raw.replace(second_arg_clean, second_arg_dim3) + cuda_kernel = cuda_kernel.replace(first_arg_raw + second_arg_raw, first_arg_raw_dim3 + second_arg_raw_dim3) + return cuda_kernel + + def processKernelLaunches(string, stats): """ Replace the CUDA style Kernel launches with the HIP style kernel launches.""" # Concat the namespace with the kernel names. (Find cleaner way of doing this later). @@ -396,12 +409,12 @@ def find_kernel_bounds(string): # Extract cuda kernel cuda_kernel = string[params[0]["start"]:parenthesis + 1] - + kernel_string = string[kernel['start']:kernel['end']] + cuda_kernel_dim3 = add_dim3(kernel_string, cuda_kernel) # Keep number of kernel launch params consistent (grid dims, group dims, stream, dynamic shared size) num_klp = len(extract_arguments(0, kernel["group"].replace("<<<", "(").replace(">>>", ")"))) - # Transform cuda kernel to hip kernel - hip_kernel = "hipLaunchKernelGGL(" + cuda_kernel[0:-1].replace( + hip_kernel = "hipLaunchKernelGGL(" + cuda_kernel_dim3[0:-1].replace( ">>>", ", 0" * (4 - num_klp) + ">>>").replace("<<<", ", ").replace(">>>", ", ") # Replace cuda kernel with hip kernel @@ -450,6 +463,7 @@ def disable_asserts(input_string): output_string = output_string.replace(input_string[start:p_end + 1], "") return output_string + def replace_forceinline(input_string): """__forceinline__'d methods can cause 'symbol multiply defined' errors in HIP. 
Adding 'static' to all such methods leads to compilation errors, so @@ -460,6 +474,7 @@ def replace_forceinline(input_string): output_string = re.sub("__forceinline__", "inline", output_string) return output_string + def replace_math_functions(input_string): """ FIXME: Temporarily replace std:: invocations of math functions with non-std:: versions to prevent linker errors NOTE: This can lead to correctness issues when running tests, since the correct version of the math function (exp/expf) might not get called. @@ -471,6 +486,7 @@ def replace_math_functions(input_string): output_string = re.sub("std::pow\(", "::pow(", output_string) return output_string + def disable_function(input_string, function, replace_style): """ Finds and disables a function in a particular file. @@ -610,11 +626,42 @@ def disable_function(input_string, function, replace_style): return output_string -def preprocessor(filepath, stats): +def get_hip_file_path(filepath, hipify_caffe2): + """ Returns the new name of the hipified file """ + if not hipify_caffe2: + return filepath + + dirpath, filename = os.path.split(filepath) + filename_without_ext, ext = os.path.splitext(filename) + + if 'gpu' in filename_without_ext: + filename_without_ext = filename_without_ext.replace('gpu', 'hip') + else: + filename_without_ext += '_hip' + + if ext == '.cu': + ext = '.cc' + + return os.path.join(dirpath, 'hip', filename_without_ext + ext) + + +def is_caffe2_gpu_file(filepath): + filename = os.path.basename(filepath) + _, ext = os.path.splitext(filename) + return 'gpu' in filename or ext in ['.cu', '.cuh'] + + +def preprocessor(filepath, stats, hipify_caffe2): """ Executes the CUDA -> HIP conversion on the specified file. """ - with openf(filepath, "r+") as fileobj: - output_source = fileobj.read() + fin_path = filepath + with open(fin_path, 'r') as fin: + output_source = fin.read() + fout_path = get_hip_file_path(filepath, hipify_caffe2) + if not os.path.exists(os.path.dirname(fout_path)): + os.makedirs(os.path.dirname(fout_path)) + + with open(fout_path, 'w') as fout: # Perform type, method, constant replacements for mapping in CUDA_TO_HIP_MAPPINGS: for cuda_type, value in mapping.items(): @@ -622,13 +669,22 @@ def preprocessor(filepath, stats): hip_type = value[0] meta_data = value[1:] + if constants.API_CAFFE2 in meta_data and not hipify_caffe2: + continue + if constants.API_RAND in meta_data and hipify_caffe2: + continue + if output_source.find(cuda_type) > -1: # Check if supported if constants.HIP_UNSUPPORTED in meta_data: stats["unsupported_calls"].append((cuda_type, filepath)) if cuda_type in output_source: - output_source = re.sub(r'\b({0})\b'.format(cuda_type), lambda x: hip_type, output_source) + if hipify_caffe2: + pattern = r'({0})'.format(cuda_type) + else: + pattern = r'(\b{0}\b)'.format(cuda_type) + output_source = re.sub(pattern, hip_type, output_source) # Perform Kernel Launch Replacements output_source = processKernelLaunches(output_source, stats) @@ -643,14 +699,7 @@ def preprocessor(filepath, stats): # Replace __forceinline__ with inline output_source = replace_forceinline(output_source) - # Overwrite file contents - fileobj.seek(0) - fileobj.write(output_source) - fileobj.truncate() - fileobj.flush() - - # Flush to disk - os.fsync(fileobj) + fout.write(output_source) def file_specific_replacement(filepath, search_string, replace_string, strict=False): @@ -847,7 +896,7 @@ def extract_arguments(start, string): closures["("] -= 1 elif string[current_position] == "<": closures["<"] += 1 - elif 
string[current_position] == ">" and string[current_position - 1] != "-": + elif string[current_position] == ">" and string[current_position - 1] != "-" and closures["<"] > 0: closures["<"] -= 1 # Finished all arguments @@ -867,7 +916,7 @@ def extract_arguments(start, string): # Add static_cast to ensure that the type of kernel arguments matches that in the corresponding kernel definition -def add_static_casts(directory, extensions, KernelTemplateParams): +def add_static_casts(filepath, KernelTemplateParams): """Add static casts to kernel launches in order to keep launch argument types and kernel definition types matching. Example: @@ -884,73 +933,70 @@ def add_static_casts(directory, extensions, KernelTemplateParams): static_cast_types = ["int", "const int", "int64_t", "THCIndex_t *", "const int *", "ptrdiff_t", "long", "const int64_t*", "int64_t *", "double"] - # Add static_casts<> to all kernel launches. - for (dirpath, _dirnames, filenames) in os.walk(directory): - for filename in filenames: - if filename_ends_with_extension(filename, extensions): - filepath = os.sep.join([dirpath, filename]) - with openf(filepath, "r+") as fileobj: - input_source = fileobj.read() - new_output_source = input_source - for kernel in re.finditer("hipLaunchKernelGGL\(", input_source): - arguments = extract_arguments(kernel.end() - 1, input_source) - - # Check if we have templating + static_cast information - argument_strings = [input_source[arg["start"]:arg["end"]] for arg in arguments] - original_kernel_name_with_template = argument_strings[0].strip() - kernel_name = original_kernel_name_with_template.split("<")[0].strip() - ignore = ["upscale"] - if kernel_name in KernelTemplateParams and kernel_name not in ignore: - # Add template to the kernel - # Add static_casts to relevant arguments - kernel_name_with_template = KernelTemplateParams[kernel_name]["kernel_with_template"] - argument_types = KernelTemplateParams[kernel_name]["arg_types"] - - # The first 5 arguments are simply (function, number blocks, dimension blocks, shared memory, stream) - # old_kernel_launch_parameters - will contain the actual arguments to the function itself. - old_kernel_launch_parameters = input_source[arguments[5]["start"]:arguments[-1]["end"]] - new_kernel_launch_parameters = old_kernel_launch_parameters - - # full_old_kernel_launch - will contain the entire kernel launch closure. - full_old_kernel_launch = input_source[arguments[0]["start"]:arguments[-1]["end"]] - full_new_kernel_launch = full_old_kernel_launch - - kernel_params = argument_strings[5:] - for arg_idx, arg in enumerate(kernel_params): - if arg_idx in argument_types: - the_type = argument_types[arg_idx] - the_arg = arg.replace("\n", "").replace("\\", "").strip() - # Not all types have issues with the hipLaunchKernelGGL. - if the_type in static_cast_types: - static_argument = "static_cast<{0}>({1})".format(the_type, the_arg) - - def replace_arg(match): - return match.group(1) + static_argument + match.group(3) - # Update to static_cast, account for cases where argument is at start/end of string - new_kernel_launch_parameters = re.sub(r'(^|\W)({0})(\W|$)'.format( - re.escape(the_arg)), replace_arg, new_kernel_launch_parameters) - - # replace kernel arguments in full kernel launch arguments w/ static_cast ones - full_new_kernel_launch = full_new_kernel_launch.replace(old_kernel_launch_parameters, new_kernel_launch_parameters) - - # PyTorch Specific: Add template type - # Here the template value will be resolved from to . 
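For context on the one-character change to extract_arguments a little above: the argument splitter tracks '(' and '<' nesting so that commas inside template arguments do not split an argument, and it must ignore '>' both in '->' and when no '<' is open (e.g. a plain comparison), otherwise the nesting count goes negative and later commas are swallowed. A small self-contained sketch of that splitting rule (not the script's own function; the input string is made up):

    def split_top_level_args(s):
        # split a comma-separated argument list, respecting () and <> nesting
        depth = {"(": 0, "<": 0}
        args, start = [], 0
        for i, c in enumerate(s):
            if c == "(":
                depth["("] += 1
            elif c == ")":
                depth["("] -= 1
            elif c == "<":
                depth["<"] += 1
            elif c == ">" and s[i - 1] != "-" and depth["<"] > 0:
                depth["<"] -= 1
            elif c == "," and depth["("] == 0 and depth["<"] == 0:
                args.append(s[start:i].strip())
                start = i + 1
        args.append(s[start:].strip())
        return args

    print(split_top_level_args("x->data, static_cast<int>(n), a > b, stream"))
    # ['x->data', 'static_cast<int>(n)', 'a > b', 'stream']
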
- if "THCUNN" in filepath.split("/") and "generic" not in filepath.split("/"): - kernel_name_with_template = kernel_name_with_template.replace("", "") - full_new_kernel_launch = re.sub(r'\b{0}\b'.format(original_kernel_name_with_template), - lambda x: kernel_name_with_template, full_new_kernel_launch) - - # Replace Launch - new_output_source = new_output_source.replace(full_old_kernel_launch, full_new_kernel_launch) - - # Overwrite file contents - fileobj.seek(0) - fileobj.write(new_output_source) - fileobj.truncate() - fileobj.flush() - - # Flush to disk - os.fsync(fileobj) + with openf(filepath, "r+") as fileobj: + input_source = fileobj.read() + new_output_source = input_source + for kernel in re.finditer("hipLaunchKernelGGL\(", input_source): + arguments = extract_arguments(kernel.end() - 1, input_source) + + # Check if we have templating + static_cast information + argument_strings = [input_source[arg["start"]:arg["end"]] for arg in arguments] + original_kernel_name_with_template = argument_strings[0].strip() + kernel_name = original_kernel_name_with_template.split("<")[0].strip() + ignore = ["upscale"] + if kernel_name in KernelTemplateParams and kernel_name not in ignore: + # Add template to the kernel + # Add static_casts to relevant arguments + kernel_name_with_template = KernelTemplateParams[kernel_name]["kernel_with_template"] + argument_types = KernelTemplateParams[kernel_name]["arg_types"] + + # The first 5 arguments are simply (function, number blocks, dimension blocks, shared memory, stream) + # old_kernel_launch_parameters - will contain the actual arguments to the function itself. + old_kernel_launch_parameters = input_source[arguments[5]["start"]:arguments[-1]["end"]] + new_kernel_launch_parameters = old_kernel_launch_parameters + + # full_old_kernel_launch - will contain the entire kernel launch closure. + full_old_kernel_launch = input_source[arguments[0]["start"]:arguments[-1]["end"]] + full_new_kernel_launch = full_old_kernel_launch + + kernel_params = argument_strings[5:] + for arg_idx, arg in enumerate(kernel_params): + if arg_idx in argument_types: + the_type = argument_types[arg_idx] + the_arg = arg.replace("\n", "").replace("\\", "").strip() + # Not all types have issues with the hipLaunchKernelGGL. + if the_type in static_cast_types: + static_argument = "static_cast<{0}>({1})".format(the_type, the_arg) + + def replace_arg(match): + return match.group(1) + static_argument + match.group(3) + # Update to static_cast, account for cases where argument is at start/end of string + new_kernel_launch_parameters = re.sub(r'(^|\W)({0})(\W|$)'.format( + re.escape(the_arg)), replace_arg, new_kernel_launch_parameters) + + # replace kernel arguments in full kernel launch arguments w/ static_cast ones + full_new_kernel_launch = full_new_kernel_launch.replace( + old_kernel_launch_parameters, new_kernel_launch_parameters) + + # PyTorch Specific: Add template type + # Here the template value will be resolved from to . 
+ if "THCUNN" in filepath.split("/") and "generic" not in filepath.split("/"): + kernel_name_with_template = kernel_name_with_template.replace("", "") + + full_new_kernel_launch = re.sub(r'\b{0}\b'.format(original_kernel_name_with_template), + lambda x: kernel_name_with_template, full_new_kernel_launch) + + # Replace Launch + new_output_source = new_output_source.replace(full_old_kernel_launch, full_new_kernel_launch) + + # Overwrite file contents + fileobj.seek(0) + fileobj.write(new_output_source) + fileobj.truncate() + fileobj.flush() + + # Flush to disk + os.fsync(fileobj) def str2bool(v): @@ -990,7 +1036,7 @@ def main(): parser.add_argument( '--extensions', nargs='+', - default=["cu", "cuh", "c", "cpp", "h", "in", "hpp"], + default=[".cu", ".cuh", ".c", ".cpp", ".h", ".in", ".hpp"], help="The extensions for files to run the Hipify script over.", required=False) @@ -1002,10 +1048,10 @@ def main(): required=False) parser.add_argument( - '--include-dirs', + '--includes', nargs='+', default=[], - help="The directories under the root that should be included.", + help="The patterns of files that should be included.", required=False) parser.add_argument( @@ -1022,6 +1068,19 @@ def main(): help="Whether to automatically add static_casts to kernel arguments.", required=False) + parser.add_argument( + '--hipify_caffe2', + type=str2bool, + default=False, + help="Whether to hipify caffe2 source", + required=False) + + parser.add_argument( + '--ignores', + nargs='+', + default=[], + help="list of patterns to ignore for hipifying") + parser.add_argument( '--show-progress', type=str2bool, @@ -1037,33 +1096,14 @@ def main(): sys.exit(1) # If no output directory, provide a default one. - if args.output_directory is "": + if not args.output_directory: args.project_directory.rstrip("/") args.output_directory = args.project_directory + "_amd" - # Make sure output directory does not exist. - if not os.path.exists(args.output_directory): - print("The output folder already exists.") - sys.exit(2) - # Copy from project directory to output directory if not done already. if not os.path.exists(args.output_directory): shutil.copytree(args.project_directory, args.output_directory) - # Extract all of the kernel parameter and template type information. - if args.add_static_casts: - KernelTemplateParams = {} - for (dirpath, _dirnames, filenames) in os.walk(args.output_directory): - for filename in filenames: - if filename_ends_with_extension(filename, args.extensions) and inside_included_directories(dirpath, args.output_directory, args.include_dirs): - the_file = os.sep.join([dirpath, filename]) - - # Store param information inside KernelTemplateParams - get_kernel_template_params( - the_file, - KernelTemplateParams, - PYTORCH_TEMPLATE_MAP) - # Open YAML file with disable information. if args.yaml_settings != "": with openf(args.yaml_settings, "r") as f: @@ -1152,17 +1192,28 @@ def main(): f.write(txt) f.truncate() + all_files = list(matched_files_iter(args.output_directory, includes=args.includes, + ignores=args.ignores, extensions=args.extensions, hipify_caffe2=args.hipify_caffe2)) + # Start Preprocessor - walk_over_directory( - args.output_directory, - extensions=args.extensions, + preprocess( + all_files, show_detailed=args.show_detailed, - include_dirs=args.include_dirs, - show_progress=args.show_progress) + show_progress=args.show_progress, + hipify_caffe2=args.hipify_caffe2) + # Extract all of the kernel parameter and template type information. 
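One thing worth keeping in mind for the driver changes around here: in caffe2 mode the hipified source is written next to the original under a hip/ subdirectory (get_hip_file_path above), which is why the static-cast pass below is pointed at get_hip_file_path(filepath, ...) rather than at the original .cu file. A quick sketch mirroring the naming rule added earlier in this file (the example paths are illustrative):

    import os

    def hip_path(filepath):
        dirpath, filename = os.path.split(filepath)
        stem, ext = os.path.splitext(filename)
        # 'gpu' in the stem becomes 'hip'; otherwise a '_hip' suffix is appended
        stem = stem.replace('gpu', 'hip') if 'gpu' in stem else stem + '_hip'
        if ext == '.cu':
            ext = '.cc'
        return os.path.join(dirpath, 'hip', stem + ext)

    print(hip_path("caffe2/operators/relu_op.cu"))   # caffe2/operators/hip/relu_op_hip.cc
    print(hip_path("caffe2/core/context_gpu.h"))     # caffe2/core/hip/context_hip.h
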
if args.add_static_casts: + KernelTemplateParams = {} + for filepath in all_files: + get_kernel_template_params( + filepath, + KernelTemplateParams, + CAFFE2_TEMPLATE_MAP if args.hipify_caffe2 else PYTORCH_TEMPLATE_MAP) + # Execute the Clang Tool to Automatically add static casts - add_static_casts(args.output_directory, args.extensions, KernelTemplateParams) + for filepath in all_files: + add_static_casts(get_hip_file_path(filepath, hipify_caffe2=args.hipify_caffe2), KernelTemplateParams) if __name__ == '__main__': diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 960e55727031b3..3a396d84b66e4f 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -342,7 +342,7 @@ self: at::zeros(self.sizes(), grad.type()).index_add_(dim, index, grad) - name: inverse(Tensor self) - self: -at::mm(output.t(), at::mm(grad, output.t())) + self: -at::mm(result.t(), at::mm(grad, result.t())) - name: kthvalue(Tensor self, int64_t k, int64_t dim, bool keepdim) self: index_select_backward(grad, dim, result1, self.sizes(), keepdim) @@ -579,7 +579,7 @@ self: repeat_backward(grad, self.dim(), repeats) # DO NOT define a backward for reshape! -# reshape is special in that it sometimes returns a view, and somtimes not. +# reshape is special in that it sometimes returns a view, and sometimes not. # Defining a backward will make codegen spit out the forward call as # as_variable(baseType->reshape(self)), # making it impossible (hard) to detect when it is actually a view. diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index f9bd8d9c0d2b28..6d85270fffc89a 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -25,11 +25,12 @@ 'index', '_indexCopy_', 'max_values', 'min_values', 'argmax', 'argmin', '_cumsum.*', '_cumprod.*', '_sum.*', '_prod.*', '_th_.*', - 'arange.*', 'range.*', '_gesv.*', 'slice', 'max_pool1d', 'max_pool2d', 'max_pool3d' + 'arange.*', 'range.*', '_gesv.*', '_getri.*', 'slice', + 'max_pool1d', 'max_pool2d', 'max_pool3d' ] PY_VARIABLE_METHOD_VARARGS = CodeTemplate("""\ -static PyObject * ${pycname}(PyObject* self, PyObject* args, PyObject* kwargs) +static PyObject * ${pycname}(PyObject* self_, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS static PythonArgParser parser({ @@ -45,7 +46,7 @@ """) PY_VARIABLE_METHOD_NOARGS = CodeTemplate("""\ -static PyObject * ${pycname}(PyObject* self, PyObject* args) +static PyObject * ${pycname}(PyObject* self_, PyObject* args) { HANDLE_TH_ERRORS ${unpack_self} @@ -98,7 +99,7 @@ PY_VARIABLE_METHOD_DEF = CodeTemplate("""\ {"${name}", (PyCFunction)${pycname}, ${flags}, NULL},""") -UNPACK_SELF = "auto& self_ = reinterpret_cast(self)->cdata;" +UNPACK_SELF = "auto& self = reinterpret_cast(self_)->cdata;" PYTHON_FUNCTION_SIGNATURE = CodeTemplate("""\ ${name}(${py_formal_args})""") @@ -329,7 +330,7 @@ def append_actuals_formals(actual, formal): continue if has_self and arg['name'] == 'self': formal_args.append('Tensor & self') - actuals.append('self_') + actuals.append('self') continue append_actuals_formals(*parse_arg(arg, arg_idx, unpack)) arg_idx += 1 @@ -582,7 +583,7 @@ def process_function(name, declarations): if len(declarations) == 1 and len(declarations[0]['args']) == 1 and has_self: tmpl = PY_VARIABLE_METHOD_NOARGS - env['actuals'] = ['self_'] + env['actuals'] = ['self'] env['flags'] = 'METH_NOARGS' else: tmpl = PY_VARIABLE_METHOD_VARARGS diff --git a/tools/autograd/gen_variable_type.py 
b/tools/autograd/gen_variable_type.py index 679e3c97f9fa54..97130b1bdbb3bc 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -54,6 +54,13 @@ 's_native_mul': 'mul', 'th_addmm': 'addmm', 's_native_addmm': 'addmm', + 'zero': 'zeros_like', + 'fill': 'full_like', +} + +# (declaration name, argument name) -> attribute name +RENAME_ATTRIBUTES = { + ('fill_', 'value'): 'fill_value' } # These functions are not worth profiling because they are very cheap and may @@ -126,7 +133,7 @@ PRE_RECORD_TRACE = CodeTemplate("""\ jit::tracer::PreTraceInfo trace_info; -if (jit::tracer::isTracing( ${tensor_args} )) { +if (jit::tracer::isTracing()) { trace_info = jit::tracer::preRecordTrace( jit::aten::${trace_name}, ${trace_inputs} ); if (!jit::tracer::ArgumentStash::empty()) { ${record_positional_attributes} @@ -138,13 +145,13 @@ """) POST_RECORD_TRACE = CodeTemplate("""\ -if (trace_info.state != nullptr) { +if (jit::tracer::isTracing()) { jit::tracer::postRecordTrace( trace_info, ${trace_outputs} ); } """) RECORD_ATTRIBUTE = CodeTemplate("""\ -setattr(trace_info.n, jit::attr::${name}, ${name});""") +setattr(trace_info.n, jit::attr::${attr_name}, ${name});""") RECORD_POSITIONAL_ATTRIBUTE = CodeTemplate("""\ setposattr(trace_info.n, ${i}, "${name}", ${name});""") @@ -417,7 +424,8 @@ def emit_record_trace(env): for arg in declaration['arguments']: if arg['simple_type'] in {'Tensor', 'TensorList'}: continue - local['record_attributes'].append(RECORD_ATTRIBUTE.substitute(name=arg['name'])) + attr_name = RENAME_ATTRIBUTES.get((declaration['name'], arg['name']), arg['name']) + local['record_attributes'].append(RECORD_ATTRIBUTE.substitute(attr_name=attr_name, name=arg['name'])) local['record_positional_attributes'] = [] for i, arg in enumerate(declaration['arguments']): diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index 0695c0d89befe6..03e0f641696144 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -14,6 +14,7 @@ #include "torch/csrc/jit/symbolic_variable.h" #include "torch/csrc/jit/tensor_conversions.h" #include "torch/csrc/utils/variadic.h" +#include "torch/csrc/autograd/functions/utils.h" #include #include @@ -38,10 +39,14 @@ using namespace torch::autograd::generated; namespace torch { namespace autograd { // Helper methods for working with Attributes (torch/csrc/jit/attributes.h) +at::Tensor maybeUnwrapVar(const at::Tensor& t) { + return t.is_variable() ? 
Variable(t).data() : t; +} + // The overloaded accessors are convenient for the generated code (since we // don't want to make the codegen do the dispatch manually) static void setattr(jit::Node* n, jit::Symbol name, int64_t v) { n->i_(name, v); } -static void setattr(jit::Node* n, jit::Symbol name, const at::Scalar& v) { n->t_(name, v.toTensor()); } +static void setattr(jit::Node* n, jit::Symbol name, const at::Scalar& v) { n->t_(name, maybeUnwrapVar(v.toTensor())); } static void setattr(jit::Node* n, jit::Symbol name, SparseTensorRef s) { n->t_(name, s.tref); } static void setattr(jit::Node* n, jit::Symbol name, const at::IntList& v) { n->is_(name, v); } static void setattr(jit::Node* n, jit::Symbol name, bool v) { n->i_(name, v); } @@ -327,26 +332,6 @@ static std::vector as_view(const Tensor & base, std::vector tens return tensors; } -struct ComputeRequiresGrad : IterArgs { - bool out = false; - using IterArgs::operator(); - void operator()(const at::Tensor& tensor) { - const auto& var = static_cast(tensor); - if (var.defined() && var.requires_grad()) { - out = true; - } - } - bool short_circuit() { return out; } -}; - -template -static bool compute_requires_grad(Args&&... args) { - if (!GradMode::is_enabled()) { - return false; - } - return ComputeRequiresGrad().apply(std::forward(args)...).out; -} - static void check_no_requires_grad(const Tensor& tensor, const char* name) { auto& var = static_cast(tensor); if (var.defined() && var.requires_grad()) { @@ -394,20 +379,6 @@ static void rebase_history(ArrayRef vars, std::shared_ptr gr } } -static void set_history(ArrayRef vars, std::shared_ptr grad_fn) { - if (grad_fn) { - for (auto& var : vars) { - if (var.defined()) { - // TODO: eliminate const_cast - auto output_nr = grad_fn->add_input_metadata(var.type(), var.sizes()); - const_cast(var).set_gradient_edge({grad_fn, output_nr}); - } else { - grad_fn->add_input_metadata(Function::undefined_input()); - } - } - } -} - struct Flatten : IterArgs { Flatten(variable_list& out) : out(out) {} variable_list& out; diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index 567bbdf5f60b27..5bf7a4e5591155 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -4,6 +4,8 @@ #include +#include + #include // for size_t #include // for function #include // for unique_ptr @@ -28,7 +30,7 @@ using at::optional; void register_variable_type_for(at::Type* baseType); -struct VariableType final : public at::Type { +struct TORCH_API VariableType final : public at::Type { VariableType(Context* context, at::Type* baseType); virtual at::ScalarType scalarType() const override; virtual at::Backend backend() const override; diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp index 8e8c87607b478b..9e4405482b4485 100644 --- a/tools/autograd/templates/python_variable_methods.cpp +++ b/tools/autograd/templates/python_variable_methods.cpp @@ -140,7 +140,7 @@ static PyObject * THPVariable_size(PyObject* self, PyObject* args, PyObject* kwa ParsedArgs<3> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); if (r.idx == 0) { - if (jit::tracer::isTracing(self_)) { + if (jit::tracer::isTracing()) { return wrap(jit::tracer::getSizeOf(self_, r.toInt64(0))); } else { return wrap(self_.size(r.toInt64(0))); diff --git a/tools/build_pytorch_libs.bat b/tools/build_pytorch_libs.bat index 0bfe87b3c9a8a1..cc8271515590ba 100755 --- a/tools/build_pytorch_libs.bat +++ 
b/tools/build_pytorch_libs.bat @@ -175,6 +175,9 @@ goto:eof cmake .. %CMAKE_GENERATOR_COMMAND% ^ -DCMAKE_BUILD_TYPE=%BUILD_TYPE% ^ -DBUILD_CAFFE2=OFF ^ + -DBUILD_TORCH="%BUILD_TORCH%" ^ + -DNVTOOLEXT_HOME="%NVTOOLEXT_HOME%" ^ + -DNO_API=ON ^ -DBUILD_ATEN=ON ^ -DBUILD_PYTHON=OFF ^ -DBUILD_BINARY=OFF ^ diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index e3d6202c69ec8f..4a0dbd04c905f1 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -255,6 +255,7 @@ function build_caffe2() { -DBUILDING_WITH_TORCH_LIBS=ON \ -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ -DBUILD_CAFFE2=$FULL_CAFFE2 \ + -DBUILD_TORCH=$BUILD_TORCH \ -DBUILD_ATEN=ON \ -DBUILD_PYTHON=$FULL_CAFFE2 \ -DBUILD_BINARY=OFF \ diff --git a/tools/cpp_build/build_all.sh b/tools/cpp_build/build_all.sh deleted file mode 100755 index abc9cd5c5d7bed..00000000000000 --- a/tools/cpp_build/build_all.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -set -ex -SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" -source $SCRIPTPATH/build_caffe2.sh -source $SCRIPTPATH/build_libtorch.sh diff --git a/tools/cpp_build/build_caffe2.sh b/tools/cpp_build/build_caffe2.sh index 4c3254e7de66b6..b35435acb388c6 100755 --- a/tools/cpp_build/build_caffe2.sh +++ b/tools/cpp_build/build_caffe2.sh @@ -12,7 +12,9 @@ echo "Building Caffe2" mkdir -p $CAFFE2_BUILDPATH pushd $CAFFE2_BUILDPATH -cmake -DUSE_CUDA=$USE_CUDA \ +cmake -DUSE_CUDA:BOOL=$USE_CUDA \ + -DBUILD_TORCH=ON \ + -DUSE_OPENMP:BOOL=${USE_OPENMP:ON} \ -DBUILD_CAFFE2=OFF \ -DBUILD_ATEN=ON \ -DBUILD_PYTHON=OFF \ diff --git a/tools/cpp_build/build_common.sh b/tools/cpp_build/build_common.sh index 6a801937a936d9..be9ac2b271743d 100755 --- a/tools/cpp_build/build_common.sh +++ b/tools/cpp_build/build_common.sh @@ -11,7 +11,6 @@ fi CAFFE2_BUILDPATH="$BUILD_PATH/caffe2" NANOPB_BUILDPATH="$BUILD_PATH/nanopb" -LIBTORCH_BUILDPATH="$BUILD_PATH/libtorch" # Build with Ninja if available. It has much cleaner output. 
GENERATE="Unix Makefiles" diff --git a/tools/cpp_build/build_libtorch.sh b/tools/cpp_build/build_libtorch.sh index b5001120b8ca91..92a9b9981ed697 100755 --- a/tools/cpp_build/build_libtorch.sh +++ b/tools/cpp_build/build_libtorch.sh @@ -13,7 +13,7 @@ mkdir -p $LIBTORCH_BUILDPATH pushd $LIBTORCH_BUILDPATH cmake -DUSE_CUDA:BOOL=$USE_CUDA \ - -DNO_API:BOOL=${NO_API:0} \ + -DNO_API:BOOL=${NO_API:-0} \ -DCAFFE2_PATH=$PYTORCHPATH/ \ -DCAFFE2_BUILD_PATH=$CAFFE2_BUILDPATH \ -DONNX_NAMESPACE=$ONNX_NAMESPACE \ diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index d73406aa063d5f..02fd0428622c13 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -1,91 +1,20 @@ -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) +if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) + if (NOT BUILD_TORCH) + return() + endif() +else() cmake_minimum_required(VERSION 3.0 FATAL_ERROR) - include(CMakeDependentOption) + project(torch CXX C) + find_package(Caffe2 REQUIRED) option(USE_CUDA "Use CUDA" ON) - option(TORCH_BUILD_TEST "Build torch test binaries" ON) - - # Flag for shared dependencies - set(BUILD_TORCH ON) endif() -cmake_policy(VERSION 3.0) - -set(CMAKE_CXX_STANDARD 11) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - -if (VERBOSE) - message(STATUS "CAFFE2_PATH is ${CAFFE2_PATH}") - message(STATUS "CAFFE2_BUILD_PATH is ${CAFFE2_BUILD_PATH}") - message(STATUS "INSTALL_PREFIX is ${INSTALL_PREFIX}") -endif() +option(BUILD_TORCH_TEST "Build torch test binaries" ON) -set(CAFFE2_INCLUDE_DIR "${CAFFE2_PATH}") -set(CAFFE2_BUILD_LIB_DIR "${CAFFE2_BUILD_PATH}/lib") -set(CAFFE2_INSTALL_INCLUDE_DIR "${INSTALL_PREFIX}/include") -set(CAFFE2_INSTALL_SHARE_DIR "${INSTALL_PREFIX}/share") -set(CAFFE2_INSTALL_LIB_DIR "${INSTALL_PREFIX}/lib") set(TORCH_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -find_library(CAFFE2_LIBRARY caffe2 - NAMES libcaffe2.so libcaffe2.dylib caffe2.lib - PATHS ${CAFFE2_INSTALL_LIB_DIR} NO_DEFAULT_PATH) -find_library(CAFFE2_GPU_LIBRARY caffe2_gpu - NAMES libcaffe2_gpu.so libcaffe2_gpu.dylib caffe2_gpu.lib - PATHS ${CAFFE2_INSTALL_LIB_DIR} NO_DEFAULT_PATH) -find_library(PROTOBUF_LIBRARY protobuf - NAMES libprotobuf.a libprotobufd.a libprotobuf.lib libprotobufd.lib - PATHS ${CAFFE2_BUILD_LIB_DIR} NO_DEFAULT_PATH) - add_subdirectory(../third_party/nanopb protobuf-nanopb) -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) - -if(USE_CUDA) - set(CMAKE_MODULE_PATH - ${INSTALL_PREFIX}/share/cmake - ${TORCH_SRC_DIR}/../cmake/Modules - ${TORCH_SRC_DIR}/../cmake/public - ${TORCH_SRC_DIR}/../cmake/Modules_CUDA_fix - /usr/lib/x86_64-linux-gnu/ - ${CMAKE_MODULE_PATH}) - set(CMAKE_LIBRARY_PATH /usr/lib/x86_64-linux-gnu/ ${CMAKE_LIBRARY_PATH}) - - if(NOT CUDA_FOUND) - find_package(CUDA 7.0) - endif() - - find_package(MAGMA) - if(CUDA_FOUND AND MAGMA_FOUND) - include_directories("${MAGMA_INCLUDE_DIR}") - set(CMAKE_REQUIRED_INCLUDES "${MAGMA_INCLUDE_DIR};${CUDA_INCLUDE_DIRS}") - include(CheckPrototypeDefinition) - check_prototype_definition(magma_get_sgeqrf_nb - "magma_int_t magma_get_sgeqrf_nb( magma_int_t m, magma_int_t n );" - "0" - "magma.h" - MAGMA_V2) - IF (MAGMA_V2) - add_definitions(-DMAGMA_V2) - endif (MAGMA_V2) - - set(USE_MAGMA 1) - if(VERBOSE) - message(STATUS "Compiling with MAGMA support") - message(STATUS "MAGMA INCLUDE DIRECTORIES: ${MAGMA_INCLUDE_DIR}") - message(STATUS "MAGMA LIBRARIES: ${MAGMA_LIBRARIES}") - 
message(STATUS "MAGMA V2 check: ${MAGMA_V2}") - endif() - else() - message(STATUS "MAGMA not found. Compiling without MAGMA support") - endif() -endif() - -add_definitions(-DUSE_CATCH -D_FORCE_INLINES -DONNX_NAMESPACE=${ONNX_NAMESPACE}) - if(NOT TORCH_INSTALL_BIN_DIR) set(TORCH_INSTALL_BIN_DIR bin) endif() @@ -98,19 +27,6 @@ if(NOT TORCH_INSTALL_LIB_DIR) set(TORCH_INSTALL_LIB_DIR lib) endif() -if(USE_CUDA) - add_definitions(-DUSE_CUDA) - - set(TORCH_CUDA_LIBRARIES - ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libcuda.so - ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvrtc.so - ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvToolsExt.so - ${CUDA_LIBRARIES}) - - list(APPEND CUDA_INCLUDE_DIRS - ${CAFFE2_INSTALL_INCLUDE_DIR}/THC) -endif() - # RPATH stuff # see https://cmake.org/Wiki/CMake_RPATH_handling if(APPLE) @@ -134,11 +50,11 @@ endif() # Generate files set(TOOLS_PATH "${TORCH_SRC_DIR}/../tools") -configure_file("${CAFFE2_PATH}/aten/src/ATen/common_with_cwrap.py" +configure_file("${TORCH_SRC_DIR}/../aten/src/ATen/common_with_cwrap.py" "${TOOLS_PATH}/shared/cwrap_common.py" COPYONLY) -configure_file("${CAFFE2_PATH}/torch/_utils_internal.py" +configure_file("${TORCH_SRC_DIR}/_utils_internal.py" "${TOOLS_PATH}/shared/_utils_internal.py" COPYONLY) @@ -164,11 +80,11 @@ add_custom_command( "${TORCH_SRC_DIR}/csrc/jit/generated/aten_interned_strings.h" COMMAND python tools/setup_helpers/generate_code.py - --declarations-path "${CAFFE2_INSTALL_SHARE_DIR}/ATen/Declarations.yaml" + --declarations-path "${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml" --nn-path "aten/src/" DEPENDS - "${CAFFE2_INSTALL_SHARE_DIR}/ATen/Declarations.yaml" - "${CAFFE2_INSTALL_INCLUDE_DIR}/THNN/generic/THNN.h" + "${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml" + "${CMAKE_CURRENT_LIST_DIR}/../aten/src/THNN/generic/THNN.h" "${TOOLS_PATH}/autograd/templates/VariableType.h" "${TOOLS_PATH}/autograd/templates/VariableType.cpp" "${TOOLS_PATH}/autograd/templates/Functions.h" @@ -191,62 +107,68 @@ add_custom_command( WORKING_DIRECTORY "${TORCH_SRC_DIR}/..") set(TORCH_SRCS - ${TORCH_SRC_DIR}/csrc/autograd/aten_variable_hooks.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType.cpp - ${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.cpp - ${TORCH_SRC_DIR}/csrc/autograd/profiler.cpp - ${TORCH_SRC_DIR}/csrc/autograd/saved_variable.cpp - ${TORCH_SRC_DIR}/csrc/autograd/grad_mode.cpp + ${TORCH_SRC_DIR}/csrc/assertions.cpp ${TORCH_SRC_DIR}/csrc/autograd/anomaly_mode.cpp + ${TORCH_SRC_DIR}/csrc/autograd/aten_variable_hooks.cpp + ${TORCH_SRC_DIR}/csrc/autograd/engine.cpp ${TORCH_SRC_DIR}/csrc/autograd/function.cpp - ${TORCH_SRC_DIR}/csrc/autograd/input_buffer.cpp - ${TORCH_SRC_DIR}/csrc/autograd/functions/utils.cpp - ${TORCH_SRC_DIR}/csrc/autograd/functions/basic_ops.cpp ${TORCH_SRC_DIR}/csrc/autograd/functions/accumulate_grad.cpp + ${TORCH_SRC_DIR}/csrc/autograd/functions/basic_ops.cpp + ${TORCH_SRC_DIR}/csrc/autograd/functions/comm.cpp ${TORCH_SRC_DIR}/csrc/autograd/functions/tensor.cpp + ${TORCH_SRC_DIR}/csrc/autograd/functions/utils.cpp + ${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.cpp + ${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType.cpp + ${TORCH_SRC_DIR}/csrc/autograd/grad_mode.cpp + ${TORCH_SRC_DIR}/csrc/autograd/input_buffer.cpp + ${TORCH_SRC_DIR}/csrc/autograd/profiler.cpp + ${TORCH_SRC_DIR}/csrc/autograd/saved_variable.cpp ${TORCH_SRC_DIR}/csrc/autograd/variable.cpp - ${TORCH_SRC_DIR}/csrc/autograd/engine.cpp - ${TORCH_SRC_DIR}/csrc/assertions.cpp - ${TORCH_SRC_DIR}/csrc/utils/variadic.cpp + ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp + 
${TORCH_SRC_DIR}/csrc/jit/autodiff.cpp + ${TORCH_SRC_DIR}/csrc/jit/export.cpp + ${TORCH_SRC_DIR}/csrc/jit/fusion_compiler.cpp ${TORCH_SRC_DIR}/csrc/jit/generated/register_aten_ops.cpp - ${TORCH_SRC_DIR}/csrc/jit/operator.cpp - ${TORCH_SRC_DIR}/csrc/jit/variable_flags.cpp + ${TORCH_SRC_DIR}/csrc/jit/graph_executor.cpp + ${TORCH_SRC_DIR}/csrc/jit/import.cpp + ${TORCH_SRC_DIR}/csrc/jit/interned_strings.cpp ${TORCH_SRC_DIR}/csrc/jit/interpreter.cpp - ${TORCH_SRC_DIR}/csrc/jit/register_prim_ops.cpp ${TORCH_SRC_DIR}/csrc/jit/ir.cpp - ${TORCH_SRC_DIR}/csrc/jit/graph_executor.cpp - ${TORCH_SRC_DIR}/csrc/jit/fusion_compiler.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/graph_fuser.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/common_subexpression_elimination.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/shape_analysis.cpp + ${TORCH_SRC_DIR}/csrc/jit/operator.cpp + ${TORCH_SRC_DIR}/csrc/jit/operator.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/common_subexpression_elimination.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/create_autodiff_subgraphs.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/dead_code_elimination.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/decompose_addmm.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/erase_number_types.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/lower_tuples.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/graph_fuser.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/inplace_check.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/loop_unrolling.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/lower_grad_of.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/lower_tuples.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/peephole.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/inplace_check.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/create_autodiff_subgraphs.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/remove_expands.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/decompose_addmm.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/shape_analysis.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/specialize_undef.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/loop_unrolling.cpp - ${TORCH_SRC_DIR}/csrc/jit/interned_strings.cpp + ${TORCH_SRC_DIR}/csrc/jit/register_prim_ops.cpp + ${TORCH_SRC_DIR}/csrc/jit/register_prim_ops.cpp + ${TORCH_SRC_DIR}/csrc/jit/register_symbols.cpp ${TORCH_SRC_DIR}/csrc/jit/script/compiler.cpp ${TORCH_SRC_DIR}/csrc/jit/script/lexer.cpp ${TORCH_SRC_DIR}/csrc/jit/script/module.cpp + ${TORCH_SRC_DIR}/csrc/jit/test_jit.cpp ${TORCH_SRC_DIR}/csrc/jit/tracer.cpp - ${TORCH_SRC_DIR}/csrc/jit/tracer_state.cpp - ${TORCH_SRC_DIR}/csrc/jit/autodiff.cpp ${TORCH_SRC_DIR}/csrc/jit/type.cpp - ${TORCH_SRC_DIR}/csrc/jit/export.cpp - ${TORCH_SRC_DIR}/csrc/jit/import.cpp ${TORCH_SRC_DIR}/csrc/onnx/onnx.cpp ${TORCH_SRC_DIR}/csrc/onnx/onnx.npb.cpp - ${TORCH_SRC_DIR}/csrc/torch.cpp) + ${TORCH_SRC_DIR}/csrc/torch.cpp + ${TORCH_SRC_DIR}/csrc/utils/tensor_flatten.cpp + ${TORCH_SRC_DIR}/csrc/utils/variadic.cpp + ) -if (NOT NO_API) +if (NOT NO_API AND NOT USE_ROCM) list(APPEND TORCH_SRCS ${TORCH_SRC_DIR}/csrc/api/src/utils.cpp ${TORCH_SRC_DIR}/csrc/api/src/cuda.cpp @@ -266,73 +188,159 @@ if (NOT NO_API) ${TORCH_SRC_DIR}/csrc/api/src/optim/lbfgs.cpp ${TORCH_SRC_DIR}/csrc/api/src/optim/rmsprop.cpp ${TORCH_SRC_DIR}/csrc/api/src/optim/sgd.cpp - ) + ) + endif() add_library(torch SHARED ${TORCH_SRCS}) -# https://gcc.gnu.org/onlinedocs/gcc-4.0.3/gcc/Warning-Options.html -target_compile_options(torch - PRIVATE - -Wall - -Wextra - -pedantic - -Wcast-align - -Wcast-qual - -Wctor-dtor-privacy - -Wdisabled-optimization - -Winit-self - 
-Wmissing-include-dirs - -Woverloaded-virtual - -Wsign-promo - -Wstrict-overflow=5 - -Wundef - -fdiagnostics-show-option - -Wno-unused-parameter - -Wno-missing-braces # This warning is buggy - -Wno-unknown-pragmas) - -if ($ENV{WERROR}) - target_compile_options(torch PRIVATE -Werror) +target_compile_definitions(torch PRIVATE _THP_CORE) + +# until they can be unified, keep these lists synced with setup.py +if(MSVC) + target_compile_options(torch PRIVATE + /MD + /Z7 + /EHa + /DNOMINMAX + /wd4267 + /wd4251 + /wd4522 + /wd4522 + /wd4838 + /wd4305 + /wd4244 + /wd4190 + /wd4101 + /wd4996 + /wd4275 + /bigobj + ) +else() + target_compile_options(torch PRIVATE + -std=c++11 + -Wall + -Wextra + -Wno-unused-parameter + -Wno-missing-field-initializers + -Wno-write-strings + -Wno-unknown-pragmas + # Clang has an unfixed bug leading to spurious missing braces + # warnings, see https://bugs.llvm.org/show_bug.cgi?id=21629 + -Wno-missing-braces + ) +endif() + +# see the source file for explanation +set_source_files_properties( + ${TORCH_SRC_DIR}/csrc/jit/register_symbols.cpp + PROPERTIES COMPILE_FLAGS -O0 + ) + +if (MSVC) +elseif ($ENV{WERROR}) + target_compile_options(torch PRIVATE -Werror -Wno-strict-overflow) +endif() + +if (MSVC) + target_link_libraries(torch onnx onnx_library) endif() target_link_libraries(torch - ${TORCH_CUDA_LIBRARIES} - ${CAFFE2_LIBRARY} - ${PROTOBUF_LIBRARY} - protobuf-nanopb -) + caffe2_library + protobuf-nanopb) + +find_package(OpenMP) +if(OPENMP_FOUND) + if (VERBOSE) + message(STATUS "Compiling with OpenMP") + endif() + target_compile_options(torch INTERFACE -fopenmp) + target_link_libraries(torch -fopenmp) +endif() + if(USE_CUDA) - if("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") - target_link_libraries(torch -Wl,--no-as-needed ${CAFFE2_GPU_LIBRARY} -Wl,--as-needed) + if(MSVC) + set(TORCH_CUDA_LIBRARIES + ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib + ${CUDA_LIBRARIES}) + target_include_directories(torch PRIVATE "${NVTOOLEXT_HOME}/include") + elseif(APPLE) + set(TORCH_CUDA_LIBRARIES + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcudart.dylib + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvrtc.dylib + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvToolsExt.dylib + ${CUDA_LIBRARIES}) + set_target_properties(torch PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") else() - target_link_libraries(torch ${CAFFE2_GPU_LIBRARY}) + set(TORCH_CUDA_LIBRARIES + ${CUDA_CUDA_LIB} + ${CUDA_NVRTC_LIB} + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvToolsExt.so + ${CUDA_LIBRARIES}) endif() + + if(MSVC OR APPLE) + target_link_libraries(torch caffe2_gpu_library ${TORCH_CUDA_LIBRARIES}) + else() + # TODO: using the full TORCH_CUDA_LIBRARIES here causes some + # builds to fail in CI, as libcuda.so can no longer be found. It's + # not clear why this is the case, and the situation should be + # investigated/cleaned up. Note that the test+jit/test_api + # targets below do require linking against the full + # TORCH_CUDA_LIBRARIES, even on Linux + target_link_libraries(torch caffe2_gpu_library ${CUDA_LIBRARIES}) + endif() + target_compile_definitions(torch PRIVATE USE_CUDA) endif() -target_include_directories(torch - PUBLIC - "${CAFFE2_INCLUDE_DIR}" - "${CAFFE2_INSTALL_INCLUDE_DIR}" - "${CAFFE2_INSTALL_INCLUDE_DIR}/TH" - "${TORCH_SRC_DIR}/.." 
- "${TORCH_SRC_DIR}") +if(USE_ROCM) + target_link_libraries(torch caffe2_hip_library) + target_compile_definitions(torch PRIVATE + USE_ROCM + __HIP_PLATFORM_HCC__ + ) + target_include_directories(torch PRIVATE + /opt/rocm/include + /opt/rocm/hcc/include + /opt/rocm/hipblas/include + /opt/rocm/hcsparse/include + ) +endif() -if (NOT NO_API) - target_include_directories(torch PUBLIC - "${TORCH_SRC_DIR}/csrc/api/" - "${TORCH_SRC_DIR}/csrc/api/include") + +set(TH_CPU_INCLUDE + # dense + ${TORCH_SRC_DIR}/../aten/src/TH + ${CMAKE_CURRENT_BINARY_DIR}/../aten/src/TH + ${TORCH_SRC_DIR}/../aten/src + ${CMAKE_CURRENT_BINARY_DIR}/../aten/src + ${CMAKE_BINARY_DIR}/aten/src) +target_include_directories(torch PRIVATE ${TH_CPU_INCLUDE}) + +if(USE_CUDA OR USE_ROCM) + set(TH_CUDA_INCLUDE + # dense + ${TORCH_SRC_DIR}/../aten/src/THC + ${CMAKE_CURRENT_BINARY_DIR}/../aten/src/THC) + target_include_directories(torch PRIVATE ${TH_CUDA_INCLUDE}) endif() +set(ATen_CPU_INCLUDE + ${TORCH_SRC_DIR}/../aten/src + ${CMAKE_CURRENT_BINARY_DIR}/../aten/src + ${CMAKE_CURRENT_BINARY_DIR}/../aten/src/ATen + ${CMAKE_BINARY_DIR}/aten/src) +target_include_directories(torch PRIVATE ${ATen_CPU_INCLUDE}) + +target_include_directories(torch PUBLIC + ${TORCH_SRC_DIR}/csrc) + # SYSTEM headers are included with -isystem and thus do not trigger warnings. target_include_directories(torch SYSTEM PUBLIC "${TORCH_SRC_DIR}/../third_party/cereal/include" # For cereal/ "${TORCH_SRC_DIR}/../third_party/nanopb") -if(USE_CUDA) - target_include_directories(torch SYSTEM PUBLIC "${CUDA_INCLUDE_DIRS}") -endif() - set_target_properties(torch PROPERTIES VERSION 1 SOVERSION 1) if(NOT ${CMAKE_VERSION} VERSION_LESS "3.1") @@ -348,45 +356,75 @@ install(TARGETS torch LIBRARY DESTINATION "${TORCH_INSTALL_LIB_DIR}" ARCHIVE DESTINATION "${TORCH_INSTALL_LIB_DIR}") -if (TORCH_BUILD_TEST) - # JIT Tests. TODO: Put into test/cpp/jit folder - +# JIT Tests. 
TODO: Put into test/cpp/jit folder +if (NOT MSVC AND NOT APPLE AND NOT USE_ROCM) add_executable(test_jit ${TORCH_SRC_DIR}/csrc/jit/test_jit.cpp) + target_link_libraries(test_jit torch ${TORCH_CUDA_LIBRARIES}) + target_compile_definitions(test_jit PUBLIC USE_CATCH _FORCE_INLINES) + target_include_directories(test_jit PUBLIC + "${TORCH_SRC_DIR}/../third_party/catch/single_include" + ${ATen_CPU_INCLUDE}) - target_link_libraries(test_jit torch) + if (USE_CUDA) + target_link_libraries(test_jit ${CUDA_LIBRARIES}) + endif() +endif() - target_include_directories(test_jit PUBLIC - "${TORCH_SRC_DIR}/../third_party/catch/single_include") - - # API Tests - - if (NOT NO_API) - set(TORCH_API_TEST_DIR "${TORCH_SRC_DIR}/../test/cpp/api") - - add_executable(test_api - ${TORCH_API_TEST_DIR}/any.cpp - ${TORCH_API_TEST_DIR}/modules.cpp - ${TORCH_API_TEST_DIR}/cursor.cpp - ${TORCH_API_TEST_DIR}/integration.cpp - ${TORCH_API_TEST_DIR}/main.cpp - ${TORCH_API_TEST_DIR}/misc.cpp - ${TORCH_API_TEST_DIR}/module.cpp - ${TORCH_API_TEST_DIR}/optim.cpp - ${TORCH_API_TEST_DIR}/sequential.cpp - ${TORCH_API_TEST_DIR}/rnn.cpp - ${TORCH_API_TEST_DIR}/serialization.cpp - ${TORCH_API_TEST_DIR}/static.cpp - ${TORCH_API_TEST_DIR}/tensor.cpp - ${TORCH_API_TEST_DIR}/tensor_cuda.cpp - # Temporary until ATen tests are built with Caffe2 - ${TORCH_API_TEST_DIR}/tensor_options.cpp - ${TORCH_API_TEST_DIR}/tensor_options_cuda.cpp - ) +if (BUILD_TORCH_TEST AND NOT NO_API AND NOT USE_ROCM) + target_include_directories(torch PUBLIC + ${TORCH_SRC_DIR}/csrc/api + ${TORCH_SRC_DIR}/csrc/api/include) + + if (NOT MSVC) + target_compile_options(torch PRIVATE -Wno-maybe-uninitialized) + endif() - target_include_directories(test_api - PUBLIC - "${TORCH_SRC_DIR}/../third_party/catch/single_include") + if (APPLE) + target_compile_options(torch PRIVATE -Wno-unknown-warning-option) + endif() - target_link_libraries(test_api torch) + set(TORCH_API_TEST_DIR "${TORCH_SRC_DIR}/../test/cpp/api") + + add_executable(test_api + ${TORCH_API_TEST_DIR}/any.cpp + ${TORCH_API_TEST_DIR}/cursor.cpp + ${TORCH_API_TEST_DIR}/integration.cpp + ${TORCH_API_TEST_DIR}/main.cpp + ${TORCH_API_TEST_DIR}/misc.cpp + ${TORCH_API_TEST_DIR}/module.cpp + ${TORCH_API_TEST_DIR}/modules.cpp + ${TORCH_API_TEST_DIR}/optim.cpp + ${TORCH_API_TEST_DIR}/parallel.cpp + ${TORCH_API_TEST_DIR}/rnn.cpp + ${TORCH_API_TEST_DIR}/sequential.cpp + ${TORCH_API_TEST_DIR}/serialization.cpp + ${TORCH_API_TEST_DIR}/static.cpp + ${TORCH_API_TEST_DIR}/tensor_cuda.cpp + ${TORCH_API_TEST_DIR}/tensor.cpp + # Temporary until ATen tests are built with Caffe2 + ${TORCH_API_TEST_DIR}/tensor_options.cpp + ${TORCH_API_TEST_DIR}/tensor_options_cuda.cpp + ) + + target_include_directories(test_api + PUBLIC + "${TORCH_SRC_DIR}/../third_party/catch/single_include" + ${ATen_CPU_INCLUDE}) + + target_link_libraries(test_api torch ${TORCH_CUDA_LIBRARIES}) + + if (APPLE) + target_compile_options(test_api PRIVATE + -Wno-unknown-warning-option + -Wno-missing-braces + -Wno-maybe-uninitialized + -Wno-reorder + ) + elseif (MSVC) + else() + target_compile_options(test_api PRIVATE + -Wno-unused-but-set-parameter + -Wno-reorder + ) endif() endif() diff --git a/torch/_six.py b/torch/_six.py index 1cea0661f56459..1d70df51830d5e 100644 --- a/torch/_six.py +++ b/torch/_six.py @@ -25,6 +25,13 @@ PY2 = sys.version_info[0] == 2 PY3 = sys.version_info[0] == 3 +if PY2: + inf = float('inf') + nan = float('nan') +else: + import math + inf = math.inf + nan = math.nan if PY2: string_classes = basestring diff --git a/torch/_tensor_docs.py 
b/torch/_tensor_docs.py index 0c83e23a9703e3..eb9709f864d0f6 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -1666,6 +1666,20 @@ def callable(a, b) -> number See :func:`torch.reshape` """) +add_docstr_all('reshape_as', + r""" +reshape_as(other) -> Tensor + +Returns this tensor as the same shape as :attr:`other`. +``self.reshape_as(other)`` is equivalent to ``self.reshape(other.sizes())``. + +Please see :meth:`~Tensor.reshape` for more information about ``reshape``. + +Args: + other (:class:`torch.Tensor`): The result tensor has the same shape + as :attr:`other`. +""") + add_docstr_all('resize_', r""" resize_(*sizes) -> Tensor @@ -1729,8 +1743,8 @@ def callable(a, b) -> number Writes all values from the tensor :attr:`src` into :attr:`self` at the indices specified in the :attr:`index` tensor. For each value in :attr:`src`, its output -index is specified by its index in :attr:`src` for dimension != :attr:`dim` and -by the corresponding value in :attr:`index` for dimension = :attr:`dim`. +index is specified by its index in :attr:`src` for ``dimension != dim`` and by +the corresponding value in :attr:`index` for ``dimension = dim``. For a 3-D tensor, :attr:`self` is updated as:: @@ -1740,14 +1754,14 @@ def callable(a, b) -> number This is the reverse operation of the manner described in :meth:`~Tensor.gather`. -:attr:`self`, :attr:`index` and :attr:`src` should have same number of -dimensions. It is also required that `index.size(d) <= src.size(d)` for all -dimensions `d`, and that `index.size(d) <= self.size(d)` for all dimensions -`d != dim`. +:attr:`self`, :attr:`index` and :attr:`src` (if it is a Tensor) should have same +number of dimensions. It is also required that ``index.size(d) <= src.size(d)`` +for all dimensions ``d``, and that ``index.size(d) <= self.size(d)`` for all +dimensions ``d != dim``. Moreover, as for :meth:`~Tensor.gather`, the values of :attr:`index` must be -between `0` and `(self.size(dim) -1)` inclusive, and all values in a row along -the specified dimension :attr:`dim` must be unique. +between ``0`` and ``self.size(dim) - 1`` inclusive, and all values in a row +along the specified dimension :attr:`dim` must be unique. Args: dim (int): the axis along which to index @@ -1771,6 +1785,50 @@ def callable(a, b) -> number [ 0.0000, 0.0000, 0.0000, 1.2300]]) """) +add_docstr_all('scatter_add_', + r""" +scatter_add_(dim, index, other) -> Tensor + +Adds all values from the tensor :attr:`other` into :attr:`self` at the indices +specified in the :attr:`index` tensor in a similar fashion as +:meth:`~torch.Tensor.scatter_`. For each value in :attr:`other`, it is added to +an index in :attr:`self` which is specified by its index in :attr:`other` +for ``dimension != dim`` and by the corresponding value in :attr:`index` for +``dimension = dim``. + +For a 3-D tensor, :attr:`self` is updated as:: + + self[index[i][j][k]][j][k] += other[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] += other[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] += other[i][j][k] # if dim == 2 + +:attr:`self`, :attr:`index` and :attr:`other` should have same number of +dimensions. It is also required that ``index.size(d) <= other.size(d)`` for all +dimensions ``d``, and that ``index.size(d) <= self.size(d)`` for all dimensions +``d != dim``. + +Moreover, as for :meth:`~Tensor.gather`, the values of :attr:`index` must be +between ``0`` and ``self.size(dim) - 1`` inclusive, and all values in a row along +the specified dimension :attr:`dim` must be unique. 
+ +Args: + dim (int): the axis along which to index + index (LongTensor): the indices of elements to scatter and add + other (Tensor): the source elements to scatter and add + +Example:: + + >>> x = torch.rand(2, 5) + >>> x + tensor([[0.7404, 0.0427, 0.6480, 0.3806, 0.8328], + [0.7953, 0.2009, 0.9154, 0.6782, 0.9620]]) + >>> torch.ones(3, 5).scatter_add_(0, torch.tensor([[0, 1, 2, 0, 0], [2, 0, 0, 1, 2]]), x) + tensor([[1.7404, 1.2009, 1.9154, 1.3806, 1.8328], + [1.0000, 1.0427, 1.0000, 1.6782, 1.0000], + [1.7953, 1.0000, 1.6480, 1.0000, 1.9620]]) + +""") + add_docstr_all('select', r""" select(dim, index) -> Tensor @@ -2407,6 +2465,20 @@ def callable(a, b) -> number """) +add_docstr_all('view_as', + r""" +view_as(other) -> Tensor + +View this tensor as the same size as :attr:`other`. +``self.view_as(other)`` is equivalent to ``self.view(other.size())``. + +Please see :meth:`~Tensor.view` for more information about ``view``. + +Args: + other (:class:`torch.Tensor`): The result tensor has the same size + as :attr:`other`. +""") + add_docstr_all('expand', r""" expand(*sizes) -> Tensor @@ -2445,6 +2517,20 @@ def callable(a, b) -> number [ 3, 3, 3, 3]]) """) +add_docstr_all('expand_as', + r""" +expand_as(other) -> Tensor + +Expand this tensor to the same size as :attr:`other`. +``self.expand_as(other)`` is equivalent to ``self.expand(other.size())``. + +Please see :meth:`~Tensor.expand` for more information about ``expand``. + +Args: + other (:class:`torch.Tensor`): The result tensor has the same size + as :attr:`other`. +""") + add_docstr_all('zero_', r""" zero_() -> Tensor diff --git a/torch/_tensor_str.py b/torch/_tensor_str.py index f02449ab6e4ff5..ee90abff708a19 100644 --- a/torch/_tensor_str.py +++ b/torch/_tensor_str.py @@ -2,6 +2,7 @@ import torch from functools import reduce from sys import float_info +from torch._six import inf, nan class __PrinterOptions(object): @@ -50,7 +51,7 @@ def set_printoptions( PRINT_OPTS.linewidth = 80 elif profile == "full": PRINT_OPTS.precision = 4 - PRINT_OPTS.threshold = float('inf') + PRINT_OPTS.threshold = inf PRINT_OPTS.edgeitems = 3 PRINT_OPTS.linewidth = 80 @@ -101,8 +102,8 @@ def __init__(self, tensor): else: copy_abs = copy.abs() - pos_inf_mask = copy_abs.eq(float('inf')) - neg_inf_mask = copy_abs.eq(float('-inf')) + pos_inf_mask = copy_abs.eq(inf) + neg_inf_mask = copy_abs.eq(-inf) nan_mask = copy_abs.ne(copy) invalid_value_mask = pos_inf_mask + neg_inf_mask + nan_mask if invalid_value_mask.all(): diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 73c42ef0fa83e4..84a08a155ce97e 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -3782,11 +3782,11 @@ def parse_kwargs(desc): end (Number): the ending value for the set of points step (Number): the gap between each pair of adjacent points. Default: ``1``. {out} - {dtype} - If `dtype` is not given, infer the data type from the other input arguments. - If any of `start`, `end`, or `stop` are floating-point, - the `dtype` is inferred to be the default dtype, see :meth:`~torch.get_default_dtype`. - Otherwise, the `dtype` is inferred to be `torch.int64`. + {dtype} If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. 
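A minimal sketch (editorial, not part of this patch) of the dtype-inference rule restated in the reflowed `torch.arange` note above; it assumes the stock default dtype of `torch.float32`:

    import torch

    # Editorial sketch: assumes torch.get_default_dtype() is torch.float32.
    torch.arange(1, 5).dtype      # all-integer arguments -> torch.int64
    torch.arange(0.0, 5.0).dtype  # a floating-point argument -> torch.float32
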
{layout} {device} {requires_grad} @@ -4202,9 +4202,6 @@ def parse_kwargs(desc): `squeeze(input, 0)` leaves the tensor unchanged, but :func:`squeeze(input, 1)` will squeeze the tensor to the shape :math:`(A \times B)`. -.. note:: As an exception to the above, a 1-dimensional tensor of size 1 will - not have its dimensions changed. - .. note:: The returned tensor shares the storage with the input tensor, so changing the contents of one will change the contents of the other. @@ -5108,58 +5105,6 @@ def parse_kwargs(desc): {requires_grad} """.format(**factory_like_common_args)) -add_docstr(torch.stft, - r""" -stft(signal, frame_length, hop, fft_size=None, normalized=False, onesided=True, window=None, pad_end=0) -> Tensor - -Short-time Fourier transform (STFT). - -Ignoring the batch dimension, this method computes the following expression: - -.. math:: - X[m, \omega] = \sum_{k = 0}^{\text{frame_length}}% - window[k]\ signal[m \times hop + k]\ e^{- j \frac{2 \pi \cdot \omega k}{\text{frame_length}}}, - -where :math:`m` is the index of the sliding window, and :math:`\omega` is -the frequency that :math:`0 \leq \omega <` :attr:`fft_size`. When -:attr:`return_onsesided` is the default value ``True``, only values for -:math:`\omega` in range :math:`\left[0, 1, 2, \dots, \left\lfloor \frac{\text{fft_size}}{2} \right\rfloor + 1\right]` -are returned because the real-to-complex transform satisfies the Hermitian -symmetry, i.e., :math:`X[m, \omega] = X[m, \text{fft_size} - \omega]^*`. - -The input :attr:`signal` must be 1-D sequence :math:`(T)` or 2-D a batch of -sequences :math:`(N \times T)`. If :attr:`fft_size` is ``None``, it is -default to same value as :attr:`frame_length`. :attr:`window` can be a -1-D tensor of size :attr:`frame_length`, e.g., see -:meth:`torch.hann_window`. If :attr:`window` is the default value ``None``, -it is treated as if having :math:`1` everywhere in the frame. -:attr:`pad_end` indicates the amount of zero padding at the end of -:attr:`signal` before STFT. If :attr:`normalized` is set to ``True``, the -function returns the normalized STFT results, i.e., multiplied by -:math:`(frame\_length)^{-0.5}`. - -Returns the real and the imaginary parts together as one tensor of size -:math:`(* \times N \times 2)`, where :math:`*` is the shape of input :attr:`signal`, -:math:`N` is the number of :math:`\omega` s considered depending on -:attr:`fft_size` and :attr:`return_onesided`, and each pair in the last -dimension represents a complex number as real part and imaginary part. - -Arguments: - signal (Tensor): the input tensor - frame_length (int): the size of window frame and STFT filter - hop (int): the distance between neighboring sliding window frames - fft_size (int, optional): size of Fourier transform. Default: ``None`` - normalized (bool, optional): controls whether to return the normalized STFT results - Default: ``False`` - onesided (bool, optional): controls whether to return half of results to - avoid redundancy Default: ``True`` - window (Tensor, optional): the optional window function. Default: ``None`` - pad_end (int, optional): implicit zero padding at the end of :attr:`signal`. Default: 0 - -Returns: - Tensor: A tensor containing the STFT result -""") - add_docstr(torch.det, r""" det(A) -> Tensor @@ -5569,7 +5514,7 @@ def parse_kwargs(desc): normalized (bool, optional): controls whether to return normalized results. Default: ``False`` onesided (bool, optional): controls whether to return half of results to - avoid redundancy Default: ``True`` + avoid redundancy. 
Default: ``True`` Returns: Tensor: A tensor containing the real-to-complex Fourier transform result diff --git a/torch/csrc/Device.cpp b/torch/csrc/Device.cpp index f130f8164c0ac1..fcd0cf2ff8bb52 100644 --- a/torch/csrc/Device.cpp +++ b/torch/csrc/Device.cpp @@ -4,12 +4,14 @@ #include "torch/csrc/utils/object_ptr.h" #include "torch/csrc/utils/python_arg_parser.h" #include "torch/csrc/utils/python_strings.h" +#include "torch/csrc/utils/python_numbers.h" #include "torch/csrc/utils/pybind.h" #include #include #include +#include #include #include @@ -95,6 +97,13 @@ PyObject *THPDevice_index(THPDevice *self) END_HANDLE_TH_ERRORS } +static Py_ssize_t THPDevice_hash(THPDevice *self) +{ + HANDLE_TH_ERRORS + return static_cast(std::hash{}(self->device) % std::numeric_limits::max()); + END_HANDLE_TH_ERRORS_RET(-1) +} + PyObject *THPDevice_rc(PyObject *a, PyObject *b, int op) { HANDLE_TH_ERRORS if (!THPDevice_Check(a) || !THPDevice_Check(b)) { @@ -181,7 +190,7 @@ PyTypeObject THPDeviceType = { 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ - 0, /* tp_hash */ + (hashfunc)THPDevice_hash, /* tp_hash */ 0, /* tp_call */ (reprfunc)THPDevice_str, /* tp_str */ 0, /* tp_getattro */ diff --git a/torch/csrc/Size.cpp b/torch/csrc/Size.cpp index 0708ebaaa36ccf..95c7c648244b9c 100644 --- a/torch/csrc/Size.cpp +++ b/torch/csrc/Size.cpp @@ -14,7 +14,7 @@ struct THPSize { PyObject * THPSize_New(const torch::autograd::Variable& var) { - if (!torch::jit::tracer::isTracing(var)) { + if (!torch::jit::tracer::isTracing()) { auto sizes = var.sizes(); return THPSize_NewFromSizes(var.dim(), sizes.data()); } @@ -38,10 +38,10 @@ PyObject * THPSize_NewFromSizes(int dim, const int64_t *sizes) return self.release(); } -static bool isTracedVar(PyObject *item) { +static bool isTracedZeroDimVar(PyObject *item) { if (!THPVariable_Check(item)) return false; auto & var = reinterpret_cast(item)->cdata; - return torch::jit::tracer::isTracing(var); + return var.dim() == 0 && torch::jit::tracer::getValueTrace(var); } static PyObject * THPSize_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs) @@ -50,10 +50,10 @@ static PyObject * THPSize_pynew(PyTypeObject *type, PyObject *args, PyObject *kw if (self) { for (Py_ssize_t i = 0; i < PyTuple_Size(self); ++i) { PyObject *item = PyTuple_GET_ITEM(self.get(), i); - if (isTracedVar(item)) { + if (THPUtils_checkLong(item)) { continue; } - if (THPUtils_checkLong(item)) { + if (torch::jit::tracer::isTracing() && isTracedZeroDimVar(item)) { continue; } // item.__index__() works with 0-dim tensors and tensors with one element diff --git a/torch/csrc/Storage.cpp b/torch/csrc/Storage.cpp index 80c6705cb77a46..3b15ead08d66a1 100644 --- a/torch/csrc/Storage.cpp +++ b/torch/csrc/Storage.cpp @@ -12,7 +12,7 @@ #include // See Note [TH abstraction violation] // - Used to get at the allocator associated with a storage -#include +#include #include #include #include "THP.h" @@ -39,7 +39,7 @@ void THPPointer::free() { } else { AT_ASSERT(ptr->data_ptr.device().is_cuda()); #ifdef USE_CUDA - THCStorage_free(at::globalContext().lazyInitCUDA(), ptr); + THStorage_free(ptr); #else AT_ERROR("Cannot free THCStorage when not built with CUDA"); #endif diff --git a/torch/csrc/WindowsTorchApiMacro.h b/torch/csrc/WindowsTorchApiMacro.h new file mode 100644 index 00000000000000..b2b03eca03dc63 --- /dev/null +++ b/torch/csrc/WindowsTorchApiMacro.h @@ -0,0 +1,18 @@ +#pragma once + +#ifdef _WIN32 + +#if defined(torch_EXPORTS) +#define TORCH_API __declspec(dllexport) +#else +#define TORCH_API 
__declspec(dllimport) +#endif + +#else +#if defined(torch_EXPORTS) +#define TORCH_API +#else +#define TORCH_API +#endif + +#endif diff --git a/torch/csrc/api/include/torch/nn/cloneable.h b/torch/csrc/api/include/torch/nn/cloneable.h index 61a32e20fe8061..3b304a652d133a 100644 --- a/torch/csrc/api/include/torch/nn/cloneable.h +++ b/torch/csrc/api/include/torch/nn/cloneable.h @@ -43,7 +43,8 @@ class Cloneable : public Module { "Are you sure you called register_parameter() inside reset() " "and not the constructor?"); for (const auto& parameter : parameters_) { - copy->parameters_[parameter.key].data().copy_(parameter->data()); + copy->parameters_[parameter.key].data().copy_( + parameter->data(), /*non_blocking=*/true); } AT_CHECK( copy->buffers_.size() == buffers_.size(), @@ -52,7 +53,8 @@ class Cloneable : public Module { "Are you sure you called register_buffer() inside reset() " "and not the constructor?"); for (const auto& buffer : buffers_) { - copy->buffers_[buffer.key].data().copy_(buffer->data()); + copy->buffers_[buffer.key].data().copy_( + buffer->data(), /*non_blocking=*/true); } AT_CHECK( copy->children_.size() == children_.size(), diff --git a/torch/csrc/api/include/torch/nn/cursor.h b/torch/csrc/api/include/torch/nn/cursor.h index c4007d89686ff6..c0f56eea72fbd0 100644 --- a/torch/csrc/api/include/torch/nn/cursor.h +++ b/torch/csrc/api/include/torch/nn/cursor.h @@ -125,13 +125,13 @@ class CursorBase { template void map(Iterator output_iterator, Function function) { for (auto& item : items_) { - *output_iterator = function(*item); + *output_iterator++ = function(*item); } } template void map(Iterator output_iterator, Function function) const { for (auto& item : items_) { - *output_iterator = function(*item); + *output_iterator++ = function(*item); } } @@ -142,13 +142,13 @@ class CursorBase { template void map_items(Iterator output_iterator, Function function) { for (auto& item : items_) { - *output_iterator = function(item.key, item.value); + *output_iterator++ = function(item.key, item.value); } } template void map_items(Iterator output_iterator, Function function) const { for (auto& item : items_) { - *output_iterator = function(item.key, item.value); + *output_iterator++ = function(item.key, item.value); } } diff --git a/torch/csrc/api/include/torch/nn/modules/any.h b/torch/csrc/api/include/torch/nn/modules/any.h index be5fd3a6702826..595baee2532d23 100644 --- a/torch/csrc/api/include/torch/nn/modules/any.h +++ b/torch/csrc/api/include/torch/nn/modules/any.h @@ -186,7 +186,7 @@ class AnyModule::Value { private: friend class AnyModule; - friend class TestValue; + friend struct TestValue; /// Constructs the `Value` from value type. template < diff --git a/torch/csrc/api/include/torch/nn/modules/batchnorm.h b/torch/csrc/api/include/torch/nn/modules/batchnorm.h index 25c75b85a7df71..d4dd669a286fef 100644 --- a/torch/csrc/api/include/torch/nn/modules/batchnorm.h +++ b/torch/csrc/api/include/torch/nn/modules/batchnorm.h @@ -19,9 +19,8 @@ struct BatchNormOptions { class BatchNormImpl : public torch::nn::Cloneable { public: - template - explicit BatchNormImpl(Ts&&... 
ts) - : BatchNormImpl(BatchNormOptions(std::forward(ts)...)) {} + explicit BatchNormImpl(int64_t features) + : BatchNormImpl(BatchNormOptions(features)) {} explicit BatchNormImpl(BatchNormOptions options); void reset() override; diff --git a/torch/csrc/api/include/torch/nn/modules/conv.h b/torch/csrc/api/include/torch/nn/modules/conv.h index f7a7cc0b142460..3f562eab1c5a52 100644 --- a/torch/csrc/api/include/torch/nn/modules/conv.h +++ b/torch/csrc/api/include/torch/nn/modules/conv.h @@ -32,9 +32,11 @@ struct ConvOptions { template class ConvImpl : public torch::nn::Cloneable { public: - template - explicit ConvImpl(Ts&&... ts) - : ConvImpl(ConvOptions(std::forward(ts)...)) {} + ConvImpl( + int64_t input_channels, + int64_t output_channels, + ExpandingArray kernel_size) + : ConvImpl(ConvOptions(input_channels, output_channels, kernel_size)) {} explicit ConvImpl(ConvOptions options); void reset() override; diff --git a/torch/csrc/api/include/torch/nn/modules/dropout.h b/torch/csrc/api/include/torch/nn/modules/dropout.h index 91f4c5b244dd85..23c3e4f127d97f 100644 --- a/torch/csrc/api/include/torch/nn/modules/dropout.h +++ b/torch/csrc/api/include/torch/nn/modules/dropout.h @@ -18,9 +18,8 @@ namespace detail { template class DropoutImplBase : public torch::nn::Cloneable { public: - template - explicit DropoutImplBase(Ts&&... ts) - : DropoutImplBase(DropoutOptions(std::forward(ts)...)) {} + explicit DropoutImplBase(double rate) + : DropoutImplBase(DropoutOptions(rate)) {} explicit DropoutImplBase(DropoutOptions options_); void reset() override; diff --git a/torch/csrc/api/include/torch/nn/modules/embedding.h b/torch/csrc/api/include/torch/nn/modules/embedding.h index f35cd05cb7ca2f..3b80d1044a2c18 100644 --- a/torch/csrc/api/include/torch/nn/modules/embedding.h +++ b/torch/csrc/api/include/torch/nn/modules/embedding.h @@ -18,9 +18,8 @@ struct EmbeddingOptions { class EmbeddingImpl : public torch::nn::Cloneable { public: - template - explicit EmbeddingImpl(Ts&&... ts) - : EmbeddingImpl(EmbeddingOptions(std::forward(ts)...)) {} + EmbeddingImpl(int64_t count, int64_t dimension) + : EmbeddingImpl(EmbeddingOptions(count, dimension)) {} explicit EmbeddingImpl(EmbeddingOptions options); void reset() override; diff --git a/torch/csrc/api/include/torch/nn/modules/linear.h b/torch/csrc/api/include/torch/nn/modules/linear.h index 34f674991b1e14..5642a9acc58c78 100644 --- a/torch/csrc/api/include/torch/nn/modules/linear.h +++ b/torch/csrc/api/include/torch/nn/modules/linear.h @@ -19,9 +19,7 @@ struct LinearOptions { class LinearImpl : public Cloneable { public: - template - explicit LinearImpl(Ts&&... ts) - : LinearImpl(LinearOptions(std::forward(ts)...)) {} + LinearImpl(int64_t in, int64_t out) : LinearImpl(LinearOptions(in, out)) {} explicit LinearImpl(LinearOptions options); void reset() override; diff --git a/torch/csrc/api/include/torch/nn/modules/rnn.h b/torch/csrc/api/include/torch/nn/modules/rnn.h index e6d2ea918e9ec6..3523eff4764fec 100644 --- a/torch/csrc/api/include/torch/nn/modules/rnn.h +++ b/torch/csrc/api/include/torch/nn/modules/rnn.h @@ -121,8 +121,8 @@ struct RNNOptions { class RNNImpl : public detail::RNNImplBase { public: - template - explicit RNNImpl(Ts&&... 
ts) : RNNImpl(RNNOptions(std::forward(ts)...)) {} + RNNImpl(int64_t input_size, int64_t hidden_size) + : RNNImpl(RNNOptions(input_size, hidden_size)) {} explicit RNNImpl(RNNOptions options); RNNOptions options; @@ -140,9 +140,8 @@ using LSTMOptions = detail::RNNOptionsBase; class LSTMImpl : public detail::RNNImplBase { public: - template - explicit LSTMImpl(Ts&&... ts) - : LSTMImpl(LSTMOptions(std::forward(ts)...)) {} + LSTMImpl(int64_t input_size, int64_t hidden_size) + : LSTMImpl(LSTMOptions(input_size, hidden_size)) {} explicit LSTMImpl(LSTMOptions options); private: @@ -157,8 +156,8 @@ using GRUOptions = detail::RNNOptionsBase; class GRUImpl : public detail::RNNImplBase { public: - template - explicit GRUImpl(Ts&&... ts) : GRUImpl(GRUOptions(std::forward(ts)...)) {} + GRUImpl(int64_t input_size, int64_t hidden_size) + : GRUImpl(GRUOptions(input_size, hidden_size)) {} explicit GRUImpl(GRUOptions options); private: diff --git a/torch/csrc/api/include/torch/nn/parallel/data_parallel.h b/torch/csrc/api/include/torch/nn/parallel/data_parallel.h new file mode 100644 index 00000000000000..82150658dcffe2 --- /dev/null +++ b/torch/csrc/api/include/torch/nn/parallel/data_parallel.h @@ -0,0 +1,176 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace torch { +namespace nn { +namespace parallel { + +/// Replicates a module on the given list of devices. +/// A replica is created by calling `clone()` on the module. For this, the +/// module must inherit from `nn::Cloneable`, or define its own `clone()` +/// method, which is expected to perform a deep copy of the module. +template +std::vector> replicate( + const std::shared_ptr& module, + const std::vector& devices) { + std::vector> replicas; + replicas.reserve(devices.size()); + for (const auto& device : devices) { + // Here we rely on the property tensors are never (or should never be) + // allocated on any particular device, but always the default device, e.g. + // in `torch::ones({3, 4})`, the device is unspecified and pulled from the + // current thread local default options. As such, we can here modify these + // thread local default options and thereby cause all tensors in the cloned + // module to be constructed directly on the device we want. + OptionsGuard guard(device); + replicas.push_back(std::static_pointer_cast(module->clone())); + } + return replicas; +} + +/// Replicates a module holder on the given list of devices. +/// This method allows calling `replicate()` with a module holder, such as +/// `Linear`. +template +std::vector> replicate( + const ModuleHolder& module, + const std::vector& devices) { + auto ptrs = replicate(module.ptr(), devices); + return std::vector>(ptrs.begin(), ptrs.end()); +} + +/// Applies the given inputs to the given modules in a parallel fashion. +/// Conceptually, a thread is spawned for each `(module, input)` pair, in which +/// `forward()` is called on the module with its corresponding input. The +/// outputs of the individual calls are stored in a vector and returned. +/// +/// The first exception caught by any thread is stashed and rethrown after all +/// threads have completed their operation. +/// +/// Further remarks: +/// 1. The length of the module container must match the length of the inputs. +/// 2. If a list of devices is supplied, it must match the list of modules in +/// length. 
Each device will be set to the current default device during the +/// invocation of the respective module. This means any tensors allocated on the +/// default device inside the module will be constructed on this device. +template +std::vector parallel_apply( + std::vector& modules, + const std::vector& inputs, + const at::optional>& devices = at::nullopt) { + AT_CHECK( + modules.size() == inputs.size(), "Must have as many inputs as modules"); + if (devices) { + AT_CHECK( + modules.size() == devices->size(), + "Must have as many devices as modules"); + } + + std::vector outputs(modules.size()); + std::mutex mutex; + + // std::exception_ptr can be passed between threads: + // > An instance of std::exception_ptr may be passed to another function, + // > possibly on another thread, where the exception may be rethrown [...]. + // https://en.cppreference.com/w/cpp/error/exception_ptr + std::exception_ptr exception; + + at::parallel_for( + /*begin=*/0, + /*end=*/modules.size(), + /*grain_size=*/1, + [&modules, &inputs, &devices, &outputs, &mutex, &exception]( + int64_t index, int64_t stop) { + for (; index < stop; ++index) { + try { + torch::OptionsGuard options_guard( + devices ? (*devices)[index] : inputs[index].device()); + auto output = modules[index]->forward(inputs[index]); + std::lock_guard lock(mutex); + outputs[index] = output; + } catch (...) { + std::lock_guard lock(mutex); + if (!exception) { + exception = std::current_exception(); + } + } + } + }); + + if (exception) { + std::rethrow_exception(exception); + } + + return outputs; +} + +/// Evaluates `module(input)` in parallel across the given `devices`. If +/// `devices` is not supplied, the invocation is parallelized across all +/// available CUDA devices. If `output_device` is supplied, the final, combined +/// tensor will be placed on this device. If not, it defaults to the first +/// device in `devices`. +/// +/// In detail, this method performs the following four distinct steps: +/// 1. *Scatter* the input to the given devices, +/// 2. *Replicate* (deep clone) the model on each device, +/// 3. *Evaluate* each module with its input on its device, +/// 4. *Gather* the outputs of each replica into a single output tensor, located +/// on the `output_device`. 
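As a rough usage sketch of the data-parallel helpers documented above (an editorial illustration, not part of this patch): `Linear`, the tensor shapes, the `torch::randn` factory call, and the include paths are assumptions chosen only for the example.

    #include <torch/torch.h>
    #include <torch/nn/parallel/data_parallel.h>

    // Editorial sketch: any cloneable module with a Tensor -> Tensor forward()
    // could stand in for Linear here.
    void data_parallel_example() {
      torch::nn::Linear linear(10, 5);      // replicated onto every device
      auto input = torch::randn({64, 10});  // scattered along dim 0
      // Scatter -> replicate -> parallel_apply -> gather; the combined output
      // lands on the first device unless an output_device is supplied.
      auto output = torch::nn::parallel::data_parallel(linear, input);
    }
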
+template +Tensor data_parallel( + ModuleType module, + Tensor input, + at::optional> devices = at::nullopt, + at::optional output_device = at::nullopt, + int64_t dim = 0) { + if (!devices) { + const auto device_count = torch::cuda::device_count(); + AT_CHECK(device_count > 0, "Expected at least one CUDA device"); + devices.emplace(); + devices->reserve(device_count); + for (size_t index = 0; index < device_count; ++index) { + devices->emplace_back(kCUDA, index); + } + } + if (!output_device) { + output_device = devices->front(); + } + + if (devices->size() == 1) { + OptionsGuard guard(devices->front()); + return module->forward(std::move(input)).to(*output_device); + } + + autograd::Scatter scatter(*devices, /*chunk_sizes=*/at::nullopt, dim); + auto scattered_inputs = scatter.apply({std::move(input)}); + + auto replicas = replicate(module, *devices); + auto outputs = parallel_apply(replicas, scattered_inputs, *devices); + return autograd::Gather(*output_device, dim) + .apply(std::move(outputs)) + .front(); +} + +} // namespace parallel +} // namespace nn +} // namespace torch diff --git a/torch/csrc/api/include/torch/nn/pimpl.h b/torch/csrc/api/include/torch/nn/pimpl.h index e4b6aa76b5a985..772e19b5e53070 100644 --- a/torch/csrc/api/include/torch/nn/pimpl.h +++ b/torch/csrc/api/include/torch/nn/pimpl.h @@ -38,10 +38,22 @@ class ModuleHolder : torch::detail::ModuleHolderIndicator { public: using ContainedType = Contained; + /// Default constructs the contained module if if has a default constructor, + /// else produces a static error. NOTE: This uses the behavior of template + /// classes in C++ that constructors (or any methods) are only compiled when + /// actually used. + ModuleHolder() : impl_(default_construct()) { + static_assert( + std::is_default_constructible::value, + "You are trying to default construct a module which has " + "no default constructor. Use = nullptr to give it the empty state " + "(e.g. `Linear linear = nullptr;` instead of `Linear linear;`)."); + } + /// Constructs the `ModuleHolder` with an empty contained value. Access to /// the underlying module is not permitted and will throw an exception, until /// a value is assigned. - explicit ModuleHolder(std::nullptr_t) : impl_(nullptr) {} + /* implicit */ ModuleHolder(std::nullptr_t) : impl_(nullptr) {} /// Constructs the `ModuleHolder` with a contained module, forwarding all /// arguments to its constructor. @@ -115,6 +127,30 @@ class ModuleHolder : torch::detail::ModuleHolderIndicator { bool is_empty() const noexcept { return impl_ == nullptr; } + + private: + /// In C++17, the two methods below could be written as the following: + /// if constexpr (std::is_default_constructible_v) { + /// return std::make_shared(); + /// } else { + /// return nullptr; + /// } + /// In C++11, we use SFINAE instead of `if constexpr`. 
+ + template < + typename T = Contained, + typename = torch::enable_if_t::value>> + std::shared_ptr default_construct() { + return std::make_shared(); + } + + template + torch::disable_if_t< + std::is_default_constructible::value, + std::shared_ptr> + default_construct() { + return nullptr; + } }; } // namespace nn } // namespace torch diff --git a/torch/csrc/api/include/torch/optim/optimizer.h b/torch/csrc/api/include/torch/optim/optimizer.h index 6ba46a2bd95127..eed600ab759bc4 100644 --- a/torch/csrc/api/include/torch/optim/optimizer.h +++ b/torch/csrc/api/include/torch/optim/optimizer.h @@ -21,26 +21,33 @@ class OptimizerBase { using ParameterCursor = torch::detail::CursorBase; /// Constructs the `Optimizer` from a vector of parameters. - explicit OptimizerBase(std::vector parameters) - : parameters_(std::move(parameters)) {} + explicit OptimizerBase(std::vector parameters); /// Constructs the `Optimizer` from a ParameterCursor, such as /// `nn::Module::parameters()` returns. - explicit OptimizerBase(ParameterCursor cursor) { - parameters_.reserve(cursor.size()); - for (const auto& parameter : cursor) { - parameters_.push_back(*parameter); - } - } + explicit OptimizerBase(const ParameterCursor& cursor); virtual ~OptimizerBase() = default; + /// Adds the given vector of parameters to the optimizer's parameter list. + /// Override this method if you want to modify the way parameters are added to + /// the `Optimizer`. + virtual void add_parameters(const std::vector& parameters); + + /// Adds the `ParameterCursor`'s parameters to the optimizer's parameter list. + /// NOTE: Calls the `vector` overload of `add_parameters` -- override + /// that method if you want to modify the behavior of `add_parameters`. + virtual void add_parameters(const ParameterCursor& cursor); + /// Zeros out the gradients of all parameters. virtual void zero_grad(); /// Provides a reference to the parameters this optimizer holds. const std::vector& parameters() const noexcept; + /// Returns the number of parameters referenced by the optimizer. + size_t size() const noexcept; + protected: OptimizerBase() = default; diff --git a/torch/csrc/api/src/nn/module.cpp b/torch/csrc/api/src/nn/module.cpp index e2809204a1aa3c..f21f6c5511b600 100644 --- a/torch/csrc/api/src/nn/module.cpp +++ b/torch/csrc/api/src/nn/module.cpp @@ -38,8 +38,7 @@ std::shared_ptr Module::clone() const { AT_ERROR( "clone() has not been implemented for ", name(), - ". Use the copy constructor if you don't require polymorphic cloning. " - "Otherwise, subclass torch::nn::Cloneable<", + ". 
Subclass torch::nn::Cloneable<", name(), "> instead of torch::nn::Module to inherit the ability to clone."); } diff --git a/torch/csrc/api/src/optim/optimizer.cpp b/torch/csrc/api/src/optim/optimizer.cpp index 57c300df07e030..47f2f36423d9d4 100644 --- a/torch/csrc/api/src/optim/optimizer.cpp +++ b/torch/csrc/api/src/optim/optimizer.cpp @@ -1,12 +1,32 @@ #include +#include #include -#include +#include +#include namespace torch { namespace optim { namespace detail { + +OptimizerBase::OptimizerBase(std::vector parameters) + : parameters_(std::move(parameters)) {} + +OptimizerBase::OptimizerBase(const ParameterCursor& cursor) { + add_parameters(cursor); +} + +void OptimizerBase::add_parameters(const std::vector& parameters) { + parameters_.insert(parameters_.end(), parameters.begin(), parameters.end()); +} + +void OptimizerBase::add_parameters(const ParameterCursor& cursor) { + std::vector tensors(cursor.size()); + cursor.map(tensors.begin(), [](const Tensor& tensor) { return tensor; }); + add_parameters(tensors); +} + void OptimizerBase::zero_grad() { for (auto& parameter : parameters_) { auto& grad = parameter.grad(); @@ -16,6 +36,10 @@ void OptimizerBase::zero_grad() { } } } + +size_t OptimizerBase::size() const noexcept { + return parameters_.size(); +} } // namespace detail } // namespace optim } // namespace torch diff --git a/torch/csrc/autograd/anomaly_mode.h b/torch/csrc/autograd/anomaly_mode.h index df1e92cec258d5..7327d03f11b887 100644 --- a/torch/csrc/autograd/anomaly_mode.h +++ b/torch/csrc/autograd/anomaly_mode.h @@ -1,5 +1,7 @@ #pragma once +#include "torch/csrc/WindowsTorchApiMacro.h" + namespace torch { namespace autograd { struct AnomalyMode { @@ -11,7 +13,7 @@ struct AnomalyMode { } private: - static bool _enabled; + TORCH_API static bool _enabled; }; diff --git a/torch/csrc/autograd/function.cpp b/torch/csrc/autograd/function.cpp index af5e410686c7f0..56ea7f7d290710 100644 --- a/torch/csrc/autograd/function.cpp +++ b/torch/csrc/autograd/function.cpp @@ -17,7 +17,13 @@ namespace torch { namespace autograd { -thread_local uint64_t Function::next_sequence_nr_ = 0; +/// Monotonically incrementing (thread local!) counter to supply sequence +/// numbers. +thread_local uint64_t Function_next_sequence_nr_ = 0; + +uint64_t& Function::get_next_sequence_nr() { + return Function_next_sequence_nr_; +} auto Function::name() const -> std::string { return at::demangle(typeid(*this).name()); diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index 90189e4a3c4d69..b65a7063f15af5 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -8,7 +8,6 @@ #include "torch/csrc/autograd/saved_variable.h" #include "torch/csrc/autograd/type_and_shape.h" #include "torch/csrc/autograd/variable.h" -#include "torch/csrc/utils/auto_unique_ptr.h" #include "torch/csrc/utils/python_stub.h" #include "torch/csrc/utils/variadic.h" @@ -101,9 +100,8 @@ struct Function : std::enable_shared_from_this { } } - explicit Function( - edge_list&& next_edges = edge_list()) - : Function(next_sequence_nr_++, std::move(next_edges)) {} + explicit Function(edge_list&& next_edges = edge_list()) + : Function(get_next_sequence_nr()++, std::move(next_edges)) {} /// Functions are neither copyable nor moveable. Function(const Function& other) = delete; @@ -307,9 +305,7 @@ struct Function : std::enable_shared_from_this { } protected: - /// Monotonically incrementing (thread local!) counter to supply sequence - /// numbers. 
- static thread_local uint64_t next_sequence_nr_; + static uint64_t& get_next_sequence_nr(); /// Performs the `Function`'s actual operation. virtual variable_list apply(variable_list&& inputs) = 0; diff --git a/torch/csrc/autograd/functions/comm.cpp b/torch/csrc/autograd/functions/comm.cpp new file mode 100644 index 00000000000000..00e140e81b083d --- /dev/null +++ b/torch/csrc/autograd/functions/comm.cpp @@ -0,0 +1,131 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace torch { +namespace autograd { +Scatter::Scatter( + std::vector devices, + const at::optional>& chunk_sizes, + int64_t dim, + const at::optional>& streams, + bool unsqueeze_scalars) + : devices_(std::move(devices)), + chunk_sizes_(chunk_sizes), + dim_(dim), + streams_(streams), + unsqueeze_scalars_(unsqueeze_scalars) {} + +variable_list Scatter::apply(variable_list&& inputs) { +#ifdef USE_CUDA + AT_ASSERT(inputs.size() == 1); + auto& input = inputs.front(); + + std::shared_ptr grad_fn; + if (compute_requires_grad(input)) { + grad_fn = + std::make_shared(/*destination_device=*/input.device(), dim_); + grad_fn->set_next_edges(collect_next_edges(input)); + } + + auto device_indices = fmap(devices_, [](const at::Device& device) -> int64_t { + return device.index(); + }); + auto tensors = torch::cuda::scatter( + std::move(input), device_indices, chunk_sizes_, dim_, streams_); + + std::vector variables; + variables.reserve(tensors.size()); + for (auto& tensor : tensors) { + AT_ASSERT(tensor.defined()); + if (unsqueeze_scalars_) { + AT_ASSERT(tensor.dim() == 1 && tensor.numel() == 1); + variables.push_back(tensor[0]); + } else { + variables.push_back(std::move(tensor)); + } + } + + set_history(variables, grad_fn); + + return variables; +#else + AT_ERROR("Scatter is only supported in CUDA environments"); +#endif +} + +Gather::Gather(const at::Device& destination_device, int64_t dim) + : destination_device_(destination_device), dim_(dim) {} + +variable_list Gather::apply(variable_list&& inputs) { +#ifdef USE_CUDA + bool all_are_zero_dim = true; + for (const auto& input : inputs) { + AT_CHECK( + input.is_cuda(), + "All inputs to Gather must be CUDA tensors, got ", + input.type()); + if (input.dim() > 0) { + all_are_zero_dim = false; + } + } + + const bool unsqueeze_scalars = all_are_zero_dim && dim_ == 0; + if (unsqueeze_scalars) { + AT_WARN( + "Was asked to gather along dimension 0, but all " + "input tensors were scalars; will instead unsqueeze " + "and return a vector."); + } + + std::vector tensors; + tensors.reserve(inputs.size()); + for (auto& variable : inputs) { + if (unsqueeze_scalars) { + tensors.push_back(variable.view(1)); + } else { + tensors.push_back(std::move(variable)); + } + } + + std::shared_ptr grad_fn; + if (compute_requires_grad(inputs)) { + std::vector source_devices; + std::vector input_sizes; + for (auto& input : inputs) { + source_devices.push_back(input.device()); + input_sizes.push_back(input.size(dim_)); + } + grad_fn = std::make_shared( + std::move(source_devices), + std::move(input_sizes), + dim_, + /*streams=*/at::nullopt, + /*unsqueeze_scalars=*/unsqueeze_scalars); + grad_fn->set_next_edges(collect_next_edges(inputs)); + } + + // This is special logic for torch::cuda::gather! + const auto destination_index = + destination_device_.is_cpu() ? 
-1 : destination_device_.index(); + auto variable = torch::cuda::gather(tensors, dim_, destination_index); + set_history(variable, grad_fn); + return {variable}; +#else + AT_ERROR("Gather is only supported in CUDA environments"); +#endif +} + +} // namespace autograd +} // namespace torch diff --git a/torch/csrc/autograd/functions/comm.h b/torch/csrc/autograd/functions/comm.h new file mode 100644 index 00000000000000..7bbd24a169dcbe --- /dev/null +++ b/torch/csrc/autograd/functions/comm.h @@ -0,0 +1,41 @@ +#pragma once + +#include +#include + +#include + +#include +#include + +namespace torch { +namespace autograd { + +struct Scatter : public Function { + explicit Scatter( + std::vector devices, + const at::optional>& chunk_sizes = at::nullopt, + int64_t dim = 0, + const at::optional>& streams = at::nullopt, + bool unsqueeze_scalars = false); + + variable_list apply(variable_list&& inputs) override; + + std::vector devices_; + at::optional> chunk_sizes_; + int64_t dim_; + at::optional> streams_; + bool unsqueeze_scalars_; +}; + +struct Gather : public Function { + explicit Gather(const at::Device& destination_device, int64_t dim = 0); + + variable_list apply(variable_list&& inputs) override; + + at::Device destination_device_; + int64_t dim_; +}; + +} // namespace autograd +} // namespace torch diff --git a/torch/csrc/autograd/functions/utils.h b/torch/csrc/autograd/functions/utils.h index 5f4d8cdb1c8bc6..a1b7ee74bf8ec6 100644 --- a/torch/csrc/autograd/functions/utils.h +++ b/torch/csrc/autograd/functions/utils.h @@ -1,7 +1,10 @@ #pragma once -#include "torch/csrc/autograd/function.h" -#include "torch/csrc/autograd/variable.h" +#include +#include +#include + +#include #include #include @@ -18,9 +21,59 @@ using function_constructor = std::function(edge_list&& variable_list wrap_outputs(const variable_list& inputs, tensor_list&& outputs, function_constructor ctr); -/** - * Checks that inputs contains exactly `args` items and that the first `required_args` - * items are not nullptr. If not specified, `required_args` defaults to `args`. - */ +/// Checks that inputs contains exactly `args` items and that the first `required_args` +/// items are not nullptr. If not specified, `required_args` defaults to `args`. void check_input_variables(const char* name, const variable_list& inputs, int args, int required_args=-1); + +struct ComputeRequiresGrad : IterArgs { + bool out = false; + using IterArgs::operator(); + void operator()(const at::Tensor& tensor) { + const auto& var = static_cast(tensor); + if (var.defined() && var.requires_grad()) { + out = true; + } + } + bool short_circuit() { + return out; + } +}; + +template +inline bool compute_requires_grad(Args&&... 
args) { + if (!GradMode::is_enabled()) { + return false; + } + return ComputeRequiresGrad().apply(std::forward<Args>(args)...).out; +} + +inline void set_history( + at::Tensor& variable, + const std::shared_ptr<Function>& grad_fn) { + if (grad_fn) { + if (variable.defined()) { + auto output_nr = + grad_fn->add_input_metadata(variable.type(), variable.sizes()); + as_variable_ref(variable).set_gradient_edge({grad_fn, output_nr}); + } else { + grad_fn->add_input_metadata(Function::undefined_input()); + } + } +} + +inline void set_history( + std::vector<Variable>&& variables, + const std::shared_ptr<Function>& grad_fn) { + for (auto& variable : variables) { + set_history(variable, grad_fn); + } +} + +inline void set_history( + std::vector<Variable>& variables, + const std::shared_ptr<Function>& grad_fn) { + for (auto& variable : variables) { + set_history(variable, grad_fn); + } +} }} diff --git a/torch/csrc/autograd/grad_mode.cpp b/torch/csrc/autograd/grad_mode.cpp index 6409c697a3b808..fc438dfad3d6a6 100644 --- a/torch/csrc/autograd/grad_mode.cpp +++ b/torch/csrc/autograd/grad_mode.cpp @@ -2,6 +2,13 @@ namespace torch { namespace autograd { -thread_local bool GradMode::_enabled = 1; +thread_local bool GradMode_enabled = 1; +bool GradMode::is_enabled() { + return GradMode_enabled; +} + +void GradMode::set_enabled(bool enabled) { + GradMode_enabled = enabled; +} }} diff --git a/torch/csrc/autograd/grad_mode.h b/torch/csrc/autograd/grad_mode.h index 31a514744a5643..e7d15446bee1fd 100644 --- a/torch/csrc/autograd/grad_mode.h +++ b/torch/csrc/autograd/grad_mode.h @@ -3,14 +3,8 @@ namespace torch { namespace autograd { struct GradMode { - static bool is_enabled() { - return _enabled; - } - static void set_enabled(bool enabled) { - _enabled = enabled; - } -private: - static thread_local bool _enabled; + static bool is_enabled(); + static void set_enabled(bool enabled); }; // A RAII, thread local (!) 
guard that enables or disables grad mode upon diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index efc6cad05a4264..ca1575699cf08f 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -24,14 +24,17 @@ PyObject * THPAutograd_initExtension(PyObject *_unused) auto m = py::handle(autograd_module).cast(); - py::class_(m,"ProfilerEvent") - .def("kind",&torch::autograd::profiler::Event::kind) - .def("name",&torch::autograd::profiler::Event::name) - .def("thread_id",&torch::autograd::profiler::Event::thread_id) - .def("device",&torch::autograd::profiler::Event::device) - .def("cpu_elapsed_us",&torch::autograd::profiler::Event::cpu_elapsed_us) - .def("cuda_elapsed_us",&torch::autograd::profiler::Event::cuda_elapsed_us) - .def("has_cuda",&torch::autograd::profiler::Event::has_cuda); + py::class_(m, "ProfilerEvent") + .def("kind", &torch::autograd::profiler::Event::kind) + .def( + "name", + [](const torch::autograd::profiler::Event& e) { return e.name(); }) + .def("thread_id", &torch::autograd::profiler::Event::thread_id) + .def("device", &torch::autograd::profiler::Event::device) + .def("cpu_elapsed_us", &torch::autograd::profiler::Event::cpu_elapsed_us) + .def( + "cuda_elapsed_us", &torch::autograd::profiler::Event::cuda_elapsed_us) + .def("has_cuda", &torch::autograd::profiler::Event::has_cuda); py::enum_(m,"ProfilerState") .value("Disabled", torch::autograd::profiler::ProfilerState::Disabled) .value("CPU", torch::autograd::profiler::ProfilerState::CPU) @@ -41,16 +44,10 @@ PyObject * THPAutograd_initExtension(PyObject *_unused) m.def("_enable_profiler", torch::autograd::profiler::enableProfiler); m.def("_disable_profiler", torch::autograd::profiler::disableProfiler); - m.def("_push_range", [](const char *name) { - using namespace torch::autograd::profiler; - if (state == ProfilerState::Disabled) return; - pushRange(name); - }); - m.def("_pop_range", []() { - using namespace torch::autograd::profiler; - if (state == ProfilerState::Disabled) return; - popRange(); + m.def("_push_range", [](const char* name) { + torch::autograd::profiler::pushRange(name); }); + m.def("_pop_range", []() { torch::autograd::profiler::popRange(); }); Py_RETURN_TRUE; } diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index d9f2f37adc16f9..02b36b470c492b 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -10,6 +10,97 @@ std::list> all_event_lists; thread_local std::shared_ptr event_list; thread_local int32_t thread_id; +RangeEventList& getEventList() { + if (!event_list) { + std::lock_guard guard(all_event_lists_mutex); + event_list = std::make_shared(); + thread_id = next_thread_id++; + all_event_lists.emplace_front(event_list); + } + return *event_list; +} + +void mark(std::string name, bool include_cuda /* = true */) { + if (state == ProfilerState::NVTX) { +#ifdef USE_CUDA + nvtxMarkA(name.c_str()); +#else + throw std::logic_error( + "mark called with NVTX tracing, but compiled without CUDA"); +#endif + } else { + getEventList().record( + EventKind::Mark, + std::move(name), + thread_id, + include_cuda && state == ProfilerState::CUDA); + } +} + +void pushRange(std::string name) { + if (state == ProfilerState::Disabled) { + return; + } + if (state == ProfilerState::NVTX) { +#ifdef USE_CUDA + nvtxRangePushA(name.c_str()); +#else + throw std::logic_error( + "pushRange called with NVTX tracing, but compiled without CUDA"); +#endif + } else { + getEventList().record( + EventKind::PushRange, + 
std::move(name), + thread_id, + state == ProfilerState::CUDA); + } +} + +void popRange() { + if (state == ProfilerState::Disabled) { + return; + } + if (state == ProfilerState::NVTX) { +#ifdef USE_CUDA + nvtxRangePop(); +#else + throw std::logic_error( + "popRange called with NVTX tracing, but compiled without CUDA"); +#endif + } else { + getEventList().record( + EventKind::PopRange, + std::string(), + thread_id, + state == ProfilerState::CUDA); + } +} + +RecordFunction::RecordFunction(Function* fn) { + if (state == ProfilerState::Disabled) + return; + pushFunctionRange(fn); +} + +RecordFunction::RecordFunction(std::string name) { + if (state == ProfilerState::Disabled) + return; + pushRange(std::move(name)); +} + +RecordFunction::RecordFunction(const char* name) { + if (state == ProfilerState::Disabled) + return; + pushRange(name); +} + +RecordFunction::~RecordFunction() { + if (state == ProfilerState::Disabled) + return; + popRange(); +} + void RecordFunction::pushFunctionRange(Function* fn) { pushRange(fn->name()); } diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index c842e00a3b90d7..3df34c728844bc 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -162,80 +162,19 @@ enum class ProfilerState { NVTX, // only emit NVTX markers }; -extern ProfilerState state; -extern uint32_t next_thread_id; -extern std::mutex all_event_lists_mutex; -extern std::list> all_event_lists; - -extern thread_local std::shared_ptr event_list; -extern thread_local int32_t thread_id; - -inline RangeEventList& getEventList() { - if (!event_list) { - std::lock_guard guard(all_event_lists_mutex); - event_list = std::make_shared(); - thread_id = next_thread_id++; - all_event_lists.emplace_front(event_list); - } - return *event_list; -} - -inline void mark(std::string name, bool include_cuda = true) { - if (state == ProfilerState::NVTX) { -#ifdef USE_CUDA - nvtxMarkA(name.c_str()); -#else - throw std::logic_error("mark called with NVTX tracing, but compiled without CUDA"); -#endif - } else { - getEventList().record(EventKind::Mark, std::move(name), thread_id, include_cuda && state == ProfilerState::CUDA); - } -} - -inline void pushRange(std::string name) { - if (state == ProfilerState::NVTX) { -#ifdef USE_CUDA - nvtxRangePushA(name.c_str()); -#else - throw std::logic_error("pushRange called with NVTX tracing, but compiled without CUDA"); -#endif - } else { - getEventList().record(EventKind::PushRange, std::move(name), thread_id, state == ProfilerState::CUDA); - } -} - -inline void popRange() { - if (state == ProfilerState::NVTX) { -#ifdef USE_CUDA - nvtxRangePop(); -#else - throw std::logic_error("popRange called with NVTX tracing, but compiled without CUDA"); -#endif - } else { - getEventList().record(EventKind::PopRange, std::string(), thread_id, state == ProfilerState::CUDA); - } -} +RangeEventList& getEventList(); +void mark(std::string name, bool include_cuda = true); +void pushRange(std::string name); +void popRange(); struct RecordFunction { - explicit RecordFunction(Function *fn) { - if (state == ProfilerState::Disabled) return; - pushFunctionRange(fn); - } + explicit RecordFunction(Function* fn); - explicit RecordFunction(std::string name) { - if (state == ProfilerState::Disabled) return; - pushRange(std::move(name)); - } + explicit RecordFunction(std::string name); - explicit RecordFunction(const char *name) { - if (state == ProfilerState::Disabled) return; - pushRange(name); - } + explicit RecordFunction(const char* name); - ~RecordFunction() { - 
if (state == ProfilerState::Disabled) return; - popRange(); - } + ~RecordFunction(); // Needed only because we don't have Function defined yet. void pushFunctionRange(Function *fn); diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp index 54943342932447..8c20c6c48e3e16 100644 --- a/torch/csrc/autograd/python_function.cpp +++ b/torch/csrc/autograd/python_function.cpp @@ -550,7 +550,7 @@ std::pair unpack_input(PyObject *args) { } static void _assert_not_tracing(const char* name, const variable_list& input_vars) { - if (tracer::isTracingVar(input_vars)) { + if (tracer::isTracing()) { std::ostringstream oss; oss << "Attempted to trace " << name; oss << ", but tracing of legacy functions is not supported"; @@ -562,7 +562,7 @@ static jit::tracer::PreTraceInfo _trace_pre_record( PyObject* op_obj, PyObject *input_objects, const variable_list& input_vars) { - if (!tracer::isTracingVar(input_vars)) { + if (!jit::tracer::isTracing()) { return jit::tracer::PreTraceInfo(); } @@ -598,7 +598,7 @@ static void _trace_post_record( const variable_list& input_vars, PyObject *output_objects, bool is_inplace) { - if (!trace_info.state) { + if (!jit::tracer::isTracing()) { return; } @@ -612,7 +612,6 @@ static void _trace_post_record( jit::tracer::postRecordTrace(trace_info, output_vars); - auto state_lock = trace_info.state->lock(); trace_info.n->i_(attr::inplace, is_inplace); } @@ -640,11 +639,6 @@ PyObject* process_outputs(PyObject *op_obj, THPFunction* grad_fn, const Unpacked bool is_inplace = static_cast(grad_fn->dirty_tensors); _wrap_outputs(grad_fn, inputs, raw_output, outputs, is_executable); - // NOTE: _trace_post_record has to run before _save_variables, because we need - // to assign traces to outputs before we convert them to SavedVariables. - // On the other hand, it needs to go after _mark_non_differentiable, because - // it might be wraping backwards in Evals, and _mark_non_differentiable uses - // grad_fn pointer equality for error checking. _trace_post_record(trace_info, op_obj, unpacked.input_vars, outputs, is_inplace); if (is_executable) { _save_variables(grad_fn); @@ -715,10 +709,6 @@ PyObject *THPFunction_apply(PyObject *cls, PyObject *inputs) // Record input nodes if tracing auto trace_info = _trace_pre_record(cls, inputs, unpacked_input.input_vars); - if (trace_info.state) { - // TODO: ezyang suggests this is unused and can be removed - ctx->is_traced = true; - } // Initialize backward function (and ctx) bool is_executable = input_info.is_executable; @@ -1009,7 +999,6 @@ static struct PyGetSetDef THPFunction_properties[] = { {"dirty_tensors", &getObject<&THPFunction::dirty_tensors>, &setObject<&THPFunction::dirty_tensors>, nullptr, nullptr}, {"needs_input_grad", &getObject<&THPFunction::needs_input_grad>, nullptr, nullptr, nullptr}, {"requires_grad", getRequiresGrad, nullptr, nullptr, nullptr}, - {"_is_tracing", &getMember, nullptr, nullptr, nullptr}, {"metadata", (getter)THPFunction_metadata, nullptr, nullptr, nullptr}, {nullptr} }; diff --git a/torch/csrc/autograd/python_function.h b/torch/csrc/autograd/python_function.h index 7bc7548e125f43..bdbca1016ebcfa 100644 --- a/torch/csrc/autograd/python_function.h +++ b/torch/csrc/autograd/python_function.h @@ -90,7 +90,6 @@ struct THPFunction { // For each input, true if the input is a THPVariable std::vector is_variable_input; char has_freed_buffers; - char is_traced; // The C++ wrapper for this Python function. // See a comment in THPFunction_asFunction for details about this field. 
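A minimal usage sketch of the reworked profiler API (illustrative only, not part of this patch): the hunks above move `mark`, `pushRange`, `popRange` and the `RecordFunction` constructors/destructor out of the header, and the RAII guard early-returns while profiling is disabled. The wrapper function below is hypothetical; only `RecordFunction` and the header path come from this code base.

#include <ATen/ATen.h>
#include "torch/csrc/autograd/profiler.h"

// Hypothetical operator wrapper: only the RecordFunction guard is taken from
// the patched API above; everything else is illustrative.
at::Tensor add_and_profile(const at::Tensor& a, const at::Tensor& b) {
  // Pushes the range "add_and_profile" now and pops it when `guard` leaves
  // scope; both operations are no-ops unless the profiler has been enabled.
  torch::autograd::profiler::RecordFunction guard("add_and_profile");
  return a + b;
}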
diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index a93f4e68fbe31e..607f3b739cdda0 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -16,7 +16,6 @@ #include "torch/csrc/autograd/generated/VariableType.h" #include "torch/csrc/autograd/utils/python_error_messages.h" #include "torch/csrc/autograd/utils/wrap_outputs.h" -#include "torch/csrc/jit/tracer_state.h" #include "torch/csrc/tensor/python_tensor.h" #include "torch/csrc/utils/auto_gil.h" #include "torch/csrc/utils/cuda_lazy_init.h" diff --git a/torch/csrc/autograd/saved_variable.cpp b/torch/csrc/autograd/saved_variable.cpp index 889f456c8b07d8..0f93c13997c8f6 100644 --- a/torch/csrc/autograd/saved_variable.cpp +++ b/torch/csrc/autograd/saved_variable.cpp @@ -3,7 +3,6 @@ #include "torch/csrc/autograd/edge.h" #include "torch/csrc/autograd/function.h" #include "torch/csrc/autograd/variable.h" -#include "torch/csrc/jit/tracer_state.h" #include @@ -29,10 +28,6 @@ SavedVariable::SavedVariable(const Variable& variable, bool is_output) { } version_counter_ = variable.version_counter(); saved_version_ = version_counter_.current_version(); - if (variable.has_tracing_state()) { - tracing_state_.reset( - new jit::tracer::ValueTracingState(variable.tracing_state())); - } } } @@ -78,9 +73,6 @@ Variable SavedVariable::unpack(std::shared_ptr saved_for) const { if (requires_grad_ && !var.grad_fn() && grad_accumulator_.expired()) throw std::logic_error("No grad accumulator for a saved leaf!"); var.set_grad_accumulator(grad_accumulator_); - if (tracing_state_) { - var.set_tracing_state(new jit::tracer::ValueTracingState(*tracing_state_)); - } return var; } diff --git a/torch/csrc/autograd/saved_variable.h b/torch/csrc/autograd/saved_variable.h index 7372d10c6fa30e..ff5a36ba04c03a 100644 --- a/torch/csrc/autograd/saved_variable.h +++ b/torch/csrc/autograd/saved_variable.h @@ -1,7 +1,7 @@ #pragma once +#include "torch/csrc/WindowsTorchApiMacro.h" #include "torch/csrc/autograd/variable_version.h" -#include "torch/csrc/jit/tracer_state.h" #include @@ -14,7 +14,7 @@ namespace torch { namespace autograd { struct Variable; struct Function; -extern const char* ERR_BACKWARD_TWICE; +TORCH_API extern const char* ERR_BACKWARD_TWICE; /// A snapshot of a variable at a certain version. A `SavedVariable` stores /// enough information to reconstruct a variable from a certain point in time. @@ -43,7 +43,6 @@ class SavedVariable { // passed in to the unpack function when reconstructing the Variable. 
std::shared_ptr grad_fn_; std::weak_ptr grad_accumulator_; - std::unique_ptr tracing_state_; VariableVersion version_counter_; uint32_t saved_version_; diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index 16e8105090ecfd..7654c4ee4c4b82 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -9,8 +9,6 @@ #include "torch/csrc/autograd/generated/Functions.h" #include "torch/csrc/autograd/generated/VariableType.h" #include "torch/csrc/autograd/variable_version.h" -#include "torch/csrc/jit/tracer_state.h" -#include "torch/csrc/utils/auto_unique_ptr.h" #include @@ -141,7 +139,6 @@ void Variable::Impl::release_resources() { grad_.reset(); grad_fn_.reset(); hooks_.clear(); - tracing_state_.reset(); } Variable::ViewImpl::ViewImpl(Variable base, at::Tensor data, Edge gradient_edge) @@ -205,13 +202,4 @@ void Variable::rebase_history(Edge gradient_edge) { } } -void Variable::set_tracing_state( - jit::tracer::ValueTracingState* new_tracing_state) { - get()->tracing_state_.reset(new_tracing_state); -} - -jit::tracer::ValueTracingState& Variable::tracing_state() const noexcept { - return *get()->tracing_state_; -} - }} // namespace torch::autograd diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index a6d670ae55f703..2def489e3ae540 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -2,11 +2,11 @@ #include "torch/csrc/utils/python_stub.h" +#include "torch/csrc/WindowsTorchApiMacro.h" #include "torch/csrc/assertions.h" #include "torch/csrc/autograd/edge.h" #include "torch/csrc/autograd/function_hook.h" #include "torch/csrc/autograd/variable_version.h" -#include "torch/csrc/utils/auto_unique_ptr.h" #include #include @@ -19,20 +19,10 @@ #include #include -namespace torch { -namespace autograd { -struct Function; -} // namespace autograd -namespace jit { namespace tracer { -// Has to be forward declared because tracer_state.h has a dependency on -// variable.h. -struct ValueTracingStateElem; -using ValueTracingState = std::list; -}} // namespace jit::tracer -} // namespace torch - namespace torch { namespace autograd { +struct Function; + ///~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// Variable ///~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -227,15 +217,6 @@ struct Variable : public at::Tensor { const std::vector>& hooks() const noexcept; void clear_hooks(); - // JIT Tracing - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - void set_tracing_state(jit::tracer::ValueTracingState* new_tracing_state); - jit::tracer::ValueTracingState& tracing_state() const noexcept; - - /// Returns true if the `Variable`'s tracing state is not null. - bool has_tracing_state() const noexcept; - // View Variables //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -280,7 +261,7 @@ struct Variable : public at::Tensor { //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ struct Variable::Impl : public at::TensorImpl { - explicit Impl( + TORCH_API explicit Impl( at::Tensor data, bool requires_grad = false, Edge edge = Edge()); @@ -378,9 +359,6 @@ struct Variable::Impl : public at::TensorImpl { // state are still thread-safe. Used by get_grad_fn and // get_grad_accumulator. 
std::mutex mutex_; - - // For use in torch::jit::tracer - auto_unique_ptr tracing_state_; }; //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -563,13 +541,6 @@ inline void Variable::clear_hooks() { get()->hooks_.clear(); } -// JIT Tracing -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -inline bool Variable::has_tracing_state() const noexcept { - return get()->tracing_state_ != nullptr; -} - // View Variables //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp index d7c3b76c64b2b3..52a27ea0a44734 100644 --- a/torch/csrc/cuda/comm.cpp +++ b/torch/csrc/cuda/comm.cpp @@ -7,10 +7,6 @@ #include #endif -#include - -#include - #include #include @@ -18,7 +14,6 @@ #include namespace torch { namespace cuda { - using namespace at; // Some operations can be performed more efficiently if we're handling tensors @@ -123,7 +118,7 @@ std::vector scatter( at::IntList devices, const at::optional>& chunk_sizes, int64_t dim, - const at::optional>& streams) { + const at::optional>& streams) { std::vector chunks; if (chunk_sizes) { const int64_t chunk_size_sum = @@ -145,18 +140,20 @@ std::vector scatter( } else { chunks = tensor.chunk(/*chunks=*/devices.size(), /*dim=*/dim); } - auto* thc_state = at::globalContext().lazyInitCUDA(); + at::CUDAGuard cuda_guard; for (size_t chunk = 0; chunk < chunks.size(); ++chunk) { - const int32_t device_index = devices[chunk]; - // We must set the current device before setting the current stream. - const at::DeviceGuard device_guard({at::kCUDA, device_index}); - const AutoStream stream_guard( - streams ? (*streams)[chunk] - : THCState_getStreamOnDevice(thc_state, device_index)); - // Copy the chunk from its current device to its destination device, which - // we set as the default device above, thus specified as -1. 
- chunks[chunk] = - chunks[chunk].contiguous().to({at::kCUDA, -1}, /*non_blocking=*/true); + const auto device_index = static_cast(devices[chunk]); + if (streams) { + AT_CHECK( + (*streams)[chunk].device() == device_index, + "Expected the device associated with the stream at index ", + chunk, " (was ", (*streams)[chunk].device(), ") ", + "to match the device supplied at that index ", + "(expected ", device_index, ")"); + cuda_guard.set_stream((*streams)[chunk]); + } + chunks[chunk] = chunks[chunk].contiguous().to( + {at::kCUDA, device_index}, /*non_blocking=*/true); } return chunks; } @@ -165,7 +162,7 @@ at::Tensor gather( at::TensorList tensors, int64_t dim, at::optional destination_index) { - AT_ASSERT(!tensors.empty()); + AT_CHECK(!tensors.empty(), "Expected at least one tensor to gather from"); at::Tensor result; int64_t total_size = 0; auto& first = tensors.front(); @@ -174,7 +171,7 @@ at::Tensor gather( for (const auto& tensor : tensors) { AT_CHECK( tensor.type().is_cuda(), "Gather expects all inputs to have CUDA type"); - AT_CHECK(tensor.ndimension() == static_cast(expected_size.size())); + AT_ASSERT(tensor.ndimension() == static_cast(expected_size.size())); expected_size[dim] = tensor.size(dim); for (size_t dimension = 0; dimension < expected_size.size(); ++dimension) { AT_CHECK( diff --git a/torch/csrc/cuda/comm.h b/torch/csrc/cuda/comm.h index a87cc455691334..c7009a56e9712f 100644 --- a/torch/csrc/cuda/comm.h +++ b/torch/csrc/cuda/comm.h @@ -1,7 +1,5 @@ #pragma once -#include - #include #include @@ -21,7 +19,7 @@ std::vector scatter( at::IntList devices, const at::optional>& chunk_sizes = at::nullopt, int64_t dim = 0, - const at::optional>& streams = at::nullopt); + const at::optional>& streams = at::nullopt); at::Tensor gather( at::TensorList tensors, diff --git a/torch/csrc/cuda/python_comm.cpp b/torch/csrc/cuda/python_comm.cpp index 902d5b93339ef7..0ec849a7498549 100644 --- a/torch/csrc/cuda/python_comm.cpp +++ b/torch/csrc/cuda/python_comm.cpp @@ -3,6 +3,7 @@ #include "torch/csrc/cuda/Stream.h" #include "torch/csrc/cuda/THCP.h" #include "torch/csrc/utils/auto_gil.h" +#include "torch/csrc/utils/functional.h" #include @@ -27,10 +28,15 @@ void initCommMethods(PyObject *module) { at::optional> chunk_sizes, int64_t dim, at::optional py_streams) { - at::optional> streams; + at::optional> streams; if (py_streams) { py::handle handle = *py_streams; - streams = THPUtils_PySequence_to_THCStreamList(handle.ptr()); + streams = fmap( + THPUtils_PySequence_to_THCStreamList(handle.ptr()), + [](THCStream* stream) { + at::detail::CUDAStream_retain(stream); + return at::CUDAStream(stream); + }); } // Note: We're holding the GIL up to here. 
AutoNoGIL no_gil; diff --git a/torch/csrc/cuda/utils.cpp b/torch/csrc/cuda/utils.cpp index fe17a8c31c952c..5090ad6647b0f3 100644 --- a/torch/csrc/cuda/utils.cpp +++ b/torch/csrc/cuda/utils.cpp @@ -34,10 +34,4 @@ std::vector THPUtils_PySequence_to_THCStreamList(PyObject *obj) { return streams; } -template<> -void THPPointer::free() { - if (ptr) - THCTensor_free(LIBRARY_STATE ptr); -} - #endif diff --git a/torch/csrc/finalizer.h b/torch/csrc/finalizer.h index 4335c50f7198e9..13b9fa5e7bdd04 100644 --- a/torch/csrc/finalizer.h +++ b/torch/csrc/finalizer.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include #include diff --git a/torch/csrc/jit/fusion_compiler.cpp b/torch/csrc/jit/fusion_compiler.cpp index 64772bede8d02c..78087f8d3118c4 100644 --- a/torch/csrc/jit/fusion_compiler.cpp +++ b/torch/csrc/jit/fusion_compiler.cpp @@ -190,11 +190,15 @@ std::string valueName(Value * n) { return "n" + std::to_string(n->unique()); } - std::string scalarValue(const at::Tensor & t) { +std::string scalarValue(const at::Tensor & t) { auto s = at::Scalar(t); - return (s.isIntegral()) ? - std::to_string(s.toLong()) : - (std::to_string(s.toDouble()) + "f"); + if (s.isIntegral()){ + return std::to_string(s.toLong()); + } else { + std::ostringstream out; + out << std::scientific << s.toDouble() << "f"; + return out.str(); + } } const char * scalarTypeName(at::ScalarType type) { diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index 5ef60d95a47dc5..2b0e0b47115506 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -226,7 +226,7 @@ struct GraphExecutorImpl { // there is no need to optimize, but we do need to splice the graph of // this excutor into the trace. Otherwise we might unroll control-flow // operations. - if(isTracing(inputs)) { + if(tracer::isTracing()) { return runTraced(std::move(inputs)); } @@ -274,26 +274,11 @@ struct GraphExecutorImpl { private: friend struct GraphExecutor; - // TODO: switching tracing to be part of the local thread state, instead of - // a per-variable property will make this check significantly faster. - // It is along the fast path, so this is important. 
- static bool isTracing(const variable_tensor_list& inputs) { - for(auto & i : inputs) { - if(i.defined() && tracer::isTracingVar(autograd::as_variable_ref(i))) - return true; - } - return false; - } variable_tensor_list runTraced(variable_tensor_list inputs) { - // TODO: unnecessary copy to variable_list - variable_list input_vars(inputs.begin(), inputs.end()); - auto state = tracer::getTracingState(input_vars); - auto input_values = fmap(input_vars, [&](const Variable& v) { - return tracer::getValueTrace(state, v); - }); + auto state = tracer::getTracingState(); + auto input_values = fmap(inputs, tracer::getValueTrace); ArgumentSpec spec(autograd::GradMode::is_enabled(), inputs); - input_vars.clear(); // don't hold inputs during execution auto outputs = runFallback(std::move(inputs)); auto all_dynamic = [](const at::ArrayRef xs) { @@ -316,7 +301,7 @@ struct GraphExecutorImpl { auto output_values = script::inlineCallTo(*state->graph, *local_graph, input_values); for(size_t i = 0; i < outputs.size(); ++i) { - tracer::setValueTrace(state, outputs[i], output_values[i]); + tracer::setValueTrace(outputs[i], output_values[i]); } return outputs; } diff --git a/torch/csrc/jit/interned_strings.cpp b/torch/csrc/jit/interned_strings.cpp index 77ec1848b3679a..d633514256dbb0 100644 --- a/torch/csrc/jit/interned_strings.cpp +++ b/torch/csrc/jit/interned_strings.cpp @@ -1,92 +1,76 @@ -#include +#include "torch/csrc/jit/interned_strings.h" #include -#include -#include +#include #include #include +#include +#include +#include #include "ATen/optional.h" -#include "torch/csrc/assertions.h" -#include "torch/csrc/jit/interned_strings.h" #include "string.h" -#include +#include "torch/csrc/assertions.h" +#include "torch/csrc/jit/interned_strings_class.h" namespace torch { namespace jit { -struct InternedStrings { - InternedStrings() - : sym_to_info_(static_cast(_keys::num_symbols)) { - #define REGISTER_SYMBOL(n, s) \ - string_to_sym_[#n "::" #s] = n::s; \ - sym_to_info_[n::s] = {namespaces::n, #n "::" #s, #s}; +Symbol InternedStrings::symbol(const std::string& s) { + std::lock_guard guard(mutex_); + return _symbol(s); +} - FORALL_NS_SYMBOLS(REGISTER_SYMBOL) - #undef REGISTER_SYMBOL - } - Symbol symbol(const std::string & s) { - std::lock_guard guard(mutex_); - return _symbol(s); - } - std::pair string(Symbol sym) { - // Builtin Symbols are also in the maps, but - // we can bypass the need to acquire a lock - // to read the map for Builtins because we already - // know their string value - switch(sym) { - #define DEFINE_CASE(ns, s) \ - case ns::s: return {#ns "::" #s, #s}; - FORALL_NS_SYMBOLS(DEFINE_CASE) - #undef DEFINE_CASE - default: - return customString(sym); - } +std::pair InternedStrings::string(Symbol sym) { + // Builtin Symbols are also in the maps, but + // we can bypass the need to acquire a lock + // to read the map for Builtins because we already + // know their string value + switch (sym) { +#define DEFINE_CASE(ns, s) \ + case ns::s: \ + return {#ns "::" #s, #s}; + FORALL_NS_SYMBOLS(DEFINE_CASE) +#undef DEFINE_CASE + default: + return customString(sym); } - Symbol ns(Symbol sym) { - switch(sym) { - #define DEFINE_CASE(ns, s) \ - case ns::s: return namespaces::ns; - FORALL_NS_SYMBOLS(DEFINE_CASE) - #undef DEFINE_CASE - default: { - std::lock_guard guard(mutex_); - return sym_to_info_.at(sym).ns; - } +} + +Symbol InternedStrings::ns(Symbol sym) { + switch (sym) { +#define DEFINE_CASE(ns, s) \ + case ns::s: \ + return namespaces::ns; + FORALL_NS_SYMBOLS(DEFINE_CASE) +#undef DEFINE_CASE + default: { + 
std::lock_guard guard(mutex_); + return sym_to_info_.at(sym).ns; } } -private: - // prereq - holding mutex_ - Symbol _symbol(const std::string & s) { - auto it = string_to_sym_.find(s); - if(it != string_to_sym_.end()) - return it->second; - - auto pos = s.find("::"); - if(pos == std::string::npos) { - throw std::runtime_error("all symbols must have a namespace, ::"); - } - Symbol ns = _symbol("namespaces::" + s.substr(0, pos)); +} - Symbol sym(sym_to_info_.size()); - string_to_sym_[s] = sym; - sym_to_info_.push_back({ns, s, s.substr(pos + strlen("::"))}); - return sym; - } +Symbol InternedStrings::_symbol(const std::string& s) { + auto it = string_to_sym_.find(s); + if (it != string_to_sym_.end()) + return it->second; - std::pair customString(Symbol sym) { - std::lock_guard guard(mutex_); - SymbolInfo& s = sym_to_info_.at(sym); - return {s.qual_name.c_str(), s.unqual_name.c_str()}; + auto pos = s.find("::"); + if (pos == std::string::npos) { + throw std::runtime_error( + "all symbols must have a namespace, ::"); } - std::unordered_map string_to_sym_; + Symbol ns = _symbol("namespaces::" + s.substr(0, pos)); - struct SymbolInfo { - Symbol ns; - std::string qual_name; - std::string unqual_name; - }; - std::vector sym_to_info_; + Symbol sym(sym_to_info_.size()); + string_to_sym_[s] = sym; + sym_to_info_.push_back({ns, s, s.substr(pos + strlen("::"))}); + return sym; +} - std::mutex mutex_; -}; +std::pair InternedStrings::customString(Symbol sym) { + std::lock_guard guard(mutex_); + SymbolInfo& s = sym_to_info_.at(sym); + return {s.qual_name.c_str(), s.unqual_name.c_str()}; +} static InternedStrings & globalStrings() { static InternedStrings s; diff --git a/torch/csrc/jit/interned_strings_class.h b/torch/csrc/jit/interned_strings_class.h new file mode 100644 index 00000000000000..c4f7b06ed5f1a7 --- /dev/null +++ b/torch/csrc/jit/interned_strings_class.h @@ -0,0 +1,39 @@ +#include +#include +#include +#include +#include +#include +#include +#include "ATen/optional.h" +#include "string.h" +#include "torch/csrc/assertions.h" +#include "torch/csrc/jit/interned_strings.h" + +namespace torch { +namespace jit { + +struct InternedStrings { + InternedStrings(); + Symbol symbol(const std::string& s); + std::pair string(Symbol sym); + Symbol ns(Symbol sym); + + private: + // prereq - holding mutex_ + Symbol _symbol(const std::string& s); + std::pair customString(Symbol sym); + std::unordered_map string_to_sym_; + + struct SymbolInfo { + Symbol ns; + std::string qual_name; + std::string unqual_name; + }; + std::vector sym_to_info_; + + std::mutex mutex_; +}; + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index a42915ba4c17a2..2b555029f16b54 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -7,7 +7,6 @@ #include "torch/csrc/jit/resource_guard.h" #include "torch/csrc/jit/source_location.h" #include "torch/csrc/jit/type.h" -#include "torch/csrc/jit/variable_flags.h" #include "torch/csrc/utils/disallow_copy.h" #include "torch/csrc/utils/functional.h" diff --git a/torch/csrc/jit/passes/onnx.h b/torch/csrc/jit/passes/onnx.h index bd6f6e4444fcc9..a58d421a458d2c 100644 --- a/torch/csrc/jit/passes/onnx.h +++ b/torch/csrc/jit/passes/onnx.h @@ -1,7 +1,6 @@ #pragma once #include "torch/csrc/jit/ir.h" -#include "torch/csrc/jit/tracer_state.h" #include "torch/csrc/onnx/onnx.h" namespace torch { namespace jit { diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index 534297aa3f174a..f03edfe2d6b3bf 100644 --- 
a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -444,7 +444,7 @@ void initPythonIRBindings(PyObject * module_) { return t.expect()->strides(); }) .def("contiguous",[](Type& t) { - return t.expect()->contiguous(); + return std::static_pointer_cast(t.expect()->contiguous()); }) .def("scalarType",[](Type& t) { return at::toString(t.expect()->scalarType()); @@ -471,8 +471,5 @@ void initPythonIRBindings(PyObject * module_) { } return std::make_tuple(graph, variables); }); - m.def("_jit_is_tracing", [](const autograd::Variable& var) { - return tracer::isTracing(var); - }); } }} diff --git a/torch/csrc/jit/python_tracer.cpp b/torch/csrc/jit/python_tracer.cpp index 2ad7a79e9a947f..6397877266683c 100644 --- a/torch/csrc/jit/python_tracer.cpp +++ b/torch/csrc/jit/python_tracer.cpp @@ -46,21 +46,26 @@ std::shared_ptr createGraphByTracing( tracer::variable_list trace_inputs, size_t num_func_inputs) { auto enter_info = tracer::enter(std::move(trace_inputs)); - py::tuple py_inputs(num_func_inputs); - for(size_t i = 0; i < num_func_inputs; ++i) { - py_inputs[i] = py::cast(enter_info.second[i]); - } - auto out = func(*py_inputs); - std::vector outputs; - if(PyTuple_Check(out.ptr())) { - outputs = py::cast>(out); - } else { - outputs.push_back(py::cast(out)); + try { + py::tuple py_inputs(num_func_inputs); + for(size_t i = 0; i < num_func_inputs; ++i) { + py_inputs[i] = py::cast(enter_info.second[i]); + } + auto out = func(*py_inputs); + std::vector outputs; + if(PyTuple_Check(out.ptr())) { + outputs = py::cast>(out); + } else { + outputs.push_back(py::cast(out)); + } + tracer::exit(outputs); + auto graph = enter_info.first->graph; + EliminateDeadCode(graph); + return graph; + } catch (...) { + tracer::abandon(); + throw; } - tracer::exit(outputs); - auto graph = enter_info.first->graph; - EliminateDeadCode(graph); - return graph; } PreTraceInfo preRecordPythonTrace(THPObjectPtr pyobj, @@ -119,17 +124,17 @@ void initPythonTracerBindings(PyObject* module_) { m.def("_tracer_exit", [](variable_list var_outputs) { tracer::exit(var_outputs); }); - m.def("_get_tracing_state", [](const variable_list& vars) { - return getTracingState(vars); + m.def("_tracer_abandon", []() { + tracer::abandon(); }); - m.def("_get_value_trace", [](std::shared_ptr& state, const Variable& var) { - return getValueTrace(state, var); + m.def("_get_tracing_state", []() { + return getTracingState(); }); - m.def("_set_value_trace", [](std::shared_ptr& state, const Variable& var, Value* value) { - return setValueTrace(state, var, value); + m.def("_get_value_trace", [](const Variable& var) { + return getValueTrace(var); }); - m.def("_is_tracing", [](const variable_list& vars) { - return isTracingVar(vars); + m.def("_set_value_trace", [](const Variable& var, Value* value) { + return setValueTrace(var, value); }); } diff --git a/torch/csrc/jit/register_symbols.cpp b/torch/csrc/jit/register_symbols.cpp new file mode 100644 index 00000000000000..d08a11dff1a724 --- /dev/null +++ b/torch/csrc/jit/register_symbols.cpp @@ -0,0 +1,18 @@ +#include "torch/csrc/jit/interned_strings_class.h" + +// This file is compiled with -O0 because the fully-macro-expanded +// function is huge and only called once at startup. 
+ +namespace torch { +namespace jit { +InternedStrings::InternedStrings() + : sym_to_info_(static_cast(_keys::num_symbols)) { +#define REGISTER_SYMBOL(n, s) \ + string_to_sym_[#n "::" #s] = n::s; \ + sym_to_info_[n::s] = {namespaces::n, #n "::" #s, #s}; + + FORALL_NS_SYMBOLS(REGISTER_SYMBOL) +#undef REGISTER_SYMBOL +} +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index 0fda8352909283..a86059f3004953 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -13,6 +13,28 @@ namespace torch { namespace jit { namespace tracer { +//////////////////////////////////////////////////////////////////////////////// +// Recording the traces +//////////////////////////////////////////////////////////////////////////////// +namespace detail { + +thread_local std::shared_ptr tracing_state; + +} // namespace detail + +const std::shared_ptr& getTracingState() { + return detail::tracing_state; +} + +void setTracingState(std::shared_ptr state) { + detail::tracing_state = std::move(state); +} + +TracingState::TracingState() + : graph(new Graph()) {} + +TracingState::~TracingState() = default; + PreTraceInfo preRecordTrace(Symbol op, at::ArrayRef inputs) { return makePreTraceInfo(inputs, [&op](const std::shared_ptr& state, Graph& graph) { @@ -22,14 +44,10 @@ PreTraceInfo preRecordTrace(Symbol op, void postRecordTrace(const PreTraceInfo& info, at::ArrayRef outputs) { - // TODO: Technically, we could reduce the scope of the lock, but since we - // haven't actually specified what the locking contract is, be conservative. - auto state_lock = info.state->lock(); - auto assignOutput = [&info](const Variable & output, Value * value) { if (output.defined()) { value->inferTypeFrom(output.data()); - setValueTrace(info.state, output, value); + setValueTrace(output, value); } }; @@ -38,35 +56,39 @@ void postRecordTrace(const PreTraceInfo& info, } } -thread_local ArgumentStash ArgumentStash::stash; - -void ArgumentStash::stashIntListElem(const std::string& arg_name, size_t size, size_t idx, const Variable& var) { - // TODO: check type? - if (!isTracing(var)) return; - auto tracing_state = getTracingState({var}); - auto & list_trace = stash.intlists.emplace(arg_name, size).first->second; - JIT_ASSERT(size == list_trace.size()); - JIT_ASSERT(idx < list_trace.size()); - JIT_ASSERT(list_trace[idx] == nullptr); - list_trace[idx] = getValueTrace(tracing_state, var); -} - autograd::Variable getSizeOf(const autograd::Variable& var, int64_t dim) { - auto tracing_state = getTracingState({var}); + auto & tracing_state = getTracingState(); auto & graph = tracing_state->graph; auto size_var = autograd::make_variable(at::Scalar(var.size(dim)).toTensor()); - auto* value = getValueTrace(tracing_state, var); + auto* value = getValueTrace(var); auto* node = graph->create(aten::size, {value}) ->i_(attr::dim, dim); node->output()->inferTypeFrom(size_var); graph->appendNode(node); - setValueTrace(tracing_state, size_var, node->output()); + setValueTrace(size_var, node->output()); return size_var; } +//////////////////////////////////////////////////////////////////////////////// +// Argument stash +//////////////////////////////////////////////////////////////////////////////// +thread_local ArgumentStash ArgumentStash::stash; + +void ArgumentStash::stashIntListElem(const std::string& arg_name, size_t size, size_t idx, const Variable& var) { + // TODO: check type? 
+ if (!isTracing()) return; + auto & list_trace = stash.intlists.emplace(arg_name, size).first->second; + JIT_ASSERT(size == list_trace.size()); + JIT_ASSERT(idx < list_trace.size()); + JIT_ASSERT(list_trace[idx] == nullptr); + list_trace[idx] = getValueTrace(var); +} +//////////////////////////////////////////////////////////////////////////////// +// Stack trace recording +//////////////////////////////////////////////////////////////////////////////// // no python present so we just do not record source information void defaultRecordSourceLocation(Node* n) {} std::atomic record_source_location(defaultRecordSourceLocation); diff --git a/torch/csrc/jit/tracer.h b/torch/csrc/jit/tracer.h index 5775091f5b8e69..2b8f32e8034f97 100644 --- a/torch/csrc/jit/tracer.h +++ b/torch/csrc/jit/tracer.h @@ -1,13 +1,12 @@ #pragma once #include "torch/csrc/jit/ir.h" -#include "torch/csrc/jit/tracer_state.h" #include "torch/csrc/assertions.h" #include "torch/csrc/utils/functional.h" #include "torch/csrc/utils/variadic.h" #include "torch/csrc/autograd/function_hook.h" #include "torch/csrc/autograd/variable.h" -#include "torch/csrc/utils/auto_unique_ptr.h" + #include #include #include @@ -20,38 +19,27 @@ namespace torch { namespace jit { namespace tracer { using torch::autograd::Variable; using variable_list = std::vector; -namespace detail { - -inline ValueTracingStateElem* getValueState(const std::shared_ptr& state, const Variable& var, bool alloc = true) { - auto& tracing_state = var.tracing_state(); - for (auto it = tracing_state.begin(); it != tracing_state.end();) { - auto ts = it->state.lock(); - // GC of invalidated tracing states - if (!ts) { - auto current_it = it++; - tracing_state.erase(current_it); - continue; - } else if (ts == state) { - return &(*it); +struct TracingState : public std::enable_shared_from_this { + TracingState(); + ~TracingState(); + + using WeakTensor = at::WeakTensor; + + struct WeakTensorHasher { + size_t operator()(const WeakTensor& t) const { + return std::hash()(t.unsafeGetTensorImpl()); } - ++it; - } - if (alloc) { - tracing_state.emplace_front(); - auto & vts = tracing_state.front(); - vts.state = state; - return &vts; - } else { - return nullptr; - } -} + }; -inline bool isElemActive(const ValueTracingStateElem& vts) { - auto state = vts.state.lock(); - return state && state->active; -} + struct WeakTensorEq { + bool operator()(const WeakTensor& t1, const WeakTensor& t2) const { + return t1.unsafeGetTensorImpl() == t2.unsafeGetTensorImpl(); + } + }; -} // namespace detail + std::unordered_map value_map; + std::shared_ptr graph; +}; // This is meant to be used as a thread local place, where we can store extra @@ -91,76 +79,27 @@ struct ArgumentStash { std::unordered_map intlists; }; -// Should a function which takes 'vars' as inputs be traced? -// It suffices for ONE variable to be tracing: any "untraced" variables -// are treated as constants. -// -// NB: This code lives in the hotpath; make sure it is fast -// -// NB: Variable overload is not variadic because we don't actually -// need it (in most cases if we have a variable_list it is already -// flattened). 
-inline bool isTracingVar(const Variable& var) { - if (!var.defined() || !var.has_tracing_state()) return false; - return std::any_of(var.tracing_state().begin(), var.tracing_state().end(), detail::isElemActive); -} - -inline bool isTracingVar(at::ArrayRef vars) { - // Reference to avoid refcount bump - for (const Variable& var : vars) { - if (isTracingVar(var)) return true; - } - return false; -} - -struct IsTracing : IterArgs { - bool out = false; - using IterArgs::operator(); - void operator()(const at::Tensor& var) { - out = out || isTracingVar(var); - } - bool short_circuit() { return out; } -}; - -// To be called with Tensor arguments from generated code -template -inline bool isTracing(Args&&... args) { - return IsTracing().apply(std::forward(args)...).out; -} +// Retrieve or set the current tracing state. Returns a nullptr if tracing is disabled. +const std::shared_ptr& getTracingState(); +void setTracingState(std::shared_ptr state); -// Retrieve the tracing state which a function applied with 'vars' should -// be recorded to. Precondition: isTracing(vars) == true. At the moment, -// we don't support mixing up variables from different traces; this code -// will need to be revisited if that ever becomes supported. -inline std::shared_ptr getTracingState(const variable_list& vars) { - std::shared_ptr state; - for (auto& var : vars) { - if (!var.defined() || !var.has_tracing_state()) continue; - for (auto & vts : var.tracing_state()) { - auto var_state = vts.state.lock(); - if (!var_state || !var_state->active) continue; - if (!state) state = var_state; - JIT_ASSERT(var_state == state); - } - } - JIT_ASSERT(state); - return state; +inline bool isTracing() { + return static_cast(getTracingState()); } -// Having finished adding a new 'node' to the graph IR owned by TracingState 'state', -// 'setValueTrace' associates this node with an output variable, so that further operations -// involving this variable know which node in the IR to reference. -inline void setValueTrace(const std::shared_ptr& state, const Variable& var, Value *value) { +// Having finished adding a new 'node' to the graph IR 'setValueTrace' associates +// this node with an output variable, so that further operations involving this +// variable know which node in the IR to reference. +inline void setValueTrace(const Variable& var, Value *value) { JIT_ASSERT(var.defined()); - auto vts = detail::getValueState(state, var); - vts->trace = value; + getTracingState()->value_map[var] = value; } // Given a variable 'var', return the 'node' which represents the instruction -// which computes the value of this variable in the IR. When 'mustExist' is -// false, we interpret untraced variables as constants that are just embedded +// which computes the value of this variable in the IR. +// Here, we interpret untraced variables as constants that are just embedded // in the graph. This is useful to handle code which does things like this -// (from torch.autograd.variable): +// (from torch.autograd.variable, now moved to C++): // // def mm(self, matrix): // output = Variable(self.data.new(self.data.size(0), matrix.data.size(1))) @@ -170,19 +109,21 @@ inline void setValueTrace(const std::shared_ptr& state, const Vari // update on, but subsequently ignores it because the alpha scaling factor is zero. // This is one of the cases where a Variable can be created inside of a trace, and // if we treat it as a constant, everything will work out. 
-inline Value* getValueTrace(const std::shared_ptr& state, const Variable& var) { +inline Value* getValueTrace(const Variable& var) { + auto &state = getTracingState(); if (!var.defined()) { Node *n = state->graph->createUndefined(); return state->graph->appendNode(n)->output(); } - auto vts = detail::getValueState(state, var, true); - if (vts->trace) return vts->trace; - - Value *constant = state->graph->appendNode(state->graph->createConstant(var.data()))->output(); - constant->inferTypeFrom(var.data()); - setValueTrace(state, var, constant); - return constant; + auto & value_map = getTracingState()->value_map; + auto it = value_map.find(var); + if (it == value_map.end()) { + Value *constant = state->graph->appendNode(state->graph->createConstant(var.data()))->output(); + constant->inferTypeFrom(var.data()); + it = value_map.emplace_hint(it, var, constant); + } + return it->second; } inline Value* getOutputTrace(const std::shared_ptr& state, const Variable& var, size_t output_no) { @@ -191,36 +132,37 @@ inline Value* getOutputTrace(const std::shared_ptr& state, const V return state->graph->appendNode(n)->output(); } - auto vts = detail::getValueState(state, var, false); - if (!vts) { + auto & value_map = getTracingState()->value_map; + auto it = value_map.find(var); + if (it == value_map.end()) { std::ostringstream os; os << "output " << output_no << " of traced region did not have observable " << "data dependence with trace inputs; this probably indicates your program " << "cannot be understood by the tracer."; throw std::runtime_error(os.str()); } - return vts->trace; + return it->second; } // Start tracing, treating 'inputs' as inputs to the trace, which can be // varied on subsequent invocations of the trace. Any other variables // will be treated as constants. -// -// NB: Why does this take an rvalue reference? We need to get a non-const -// reference to at::Tensor buffer to call unsafeGetTH, but you can't get this -// out of a const vector (silly std::vector...) inline std::pair, variable_list> enter( variable_list inputs) { + if (isTracing()) { + AT_ERROR("Tracing can't be nested"); + } auto state = std::make_shared(); + setTracingState(state); for (auto& input : inputs) { - auto * value_state = detail::getValueState(state, input, false); + auto * value_state = state->value_map[input]; if (value_state) { // See Note [Repeated inputs] in tracer.cpp input = input.view(input.sizes()); } auto input_node = state->graph->addInput(input.name()); - setValueTrace(state, input, input_node); input_node->inferTypeFrom(input.data()); + state->value_map[input] = input_node; } return std::make_pair(state, inputs); } @@ -229,27 +171,29 @@ inline std::pair, variable_list> enter( // are the variables whose values will be computed upon subsequent // invocations of the trace. inline void exit(const variable_list& outputs) { - auto state = getTracingState(outputs); + auto & state = getTracingState(); size_t i = 0; for (auto& output : outputs) { state->graph->registerOutput(getOutputTrace(state, output, i)); i++; } - state->active = false; + setTracingState(nullptr); +} + +// Abort tracing. Used to reset the state in case of errors. 
+inline void abandon() { + setTracingState(nullptr); } // Pre-recorded information about the trace before we actually carry // out the trace struct PreTraceInfo { - std::shared_ptr state; Node *n; }; PreTraceInfo preRecordTrace(Symbol op, at::ArrayRef inputs); void postRecordTrace(const PreTraceInfo& info, at::ArrayRef outputs); -autograd::Variable getSizeOf(const autograd::Variable& var, int64_t dim); - void recordSourceLocation(Node* n); void setRecordSourceLocation(void (*v)(Node*)); @@ -259,15 +203,14 @@ void setRecordSourceLocation(void (*v)(Node*)); template PreTraceInfo makePreTraceInfo(at::ArrayRef inputs, F ctor) { PreTraceInfo info; - info.state = getTracingState(inputs); - auto& graph = info.state->graph; - auto state_lock = info.state->lock(); + auto & state = getTracingState(); + auto & graph = state->graph; - Node *n = ctor(info.state, *graph); + Node *n = ctor(state, *graph); recordSourceLocation(n); - for (Variable input : inputs) { - n->addInput(getValueTrace(info.state, input)); + for (const Variable & input : inputs) { + n->addInput(getValueTrace(input)); } // NB: Order matters. This must append after inputs but before outputs. @@ -278,4 +221,6 @@ PreTraceInfo makePreTraceInfo(at::ArrayRef inputs, F ctor) { return info; } +autograd::Variable getSizeOf(const autograd::Variable& var, int64_t dim); + }}} // namespace torch::jit::tracer diff --git a/torch/csrc/jit/tracer_state.cpp b/torch/csrc/jit/tracer_state.cpp deleted file mode 100644 index 6f445625fd6b73..00000000000000 --- a/torch/csrc/jit/tracer_state.cpp +++ /dev/null @@ -1,12 +0,0 @@ -#include "torch/csrc/jit/tracer_state.h" -#include "torch/csrc/jit/ir.h" - -namespace torch { namespace jit { namespace tracer { - -TracingState::TracingState() - : graph(new Graph()) - , active(true) {} - -TracingState::~TracingState() = default; - -}}} // namespace torch::jit::tracer diff --git a/torch/csrc/jit/tracer_state.h b/torch/csrc/jit/tracer_state.h deleted file mode 100644 index 887ad94dced892..00000000000000 --- a/torch/csrc/jit/tracer_state.h +++ /dev/null @@ -1,59 +0,0 @@ -#pragma once - -#include "torch/csrc/autograd/edge.h" -#include "torch/csrc/autograd/variable.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace torch { namespace jit { -struct Graph; -struct Value; -}} // namespace torch::jit - -namespace torch { namespace jit { namespace tracer { - -// TracingState tracks the necessary state when we are tracing the execution of -// autograd code; most importantly, it holds a reference to the actual IR -// graph which we are recording the trace to. -// -// The liveness of a TracingState is expected to be a superset of the region -// of code being traced; in particular, Variables do not keep a TracingState -// live. Instead, they hold weak pointers to TracingState, to prevent leaks -// from arising when a variable that participated in a trace outlives the -// actual trace itself. 
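The `enter`/`exit`/`abandon` trio above is what the Python tracer drives. A rough sketch of that lifecycle, using the `torch._C._tracer_enter`/`_tracer_exit`/`_tracer_abandon` bindings that appear later in this diff; `trace_once`, `fn`, and `inputs` are placeholder names, and the error handling mirrors `LegacyTracedModule.forward`:

```python
import torch

def trace_once(fn, inputs):
    # inputs: a flat tuple of Tensors to be treated as trace inputs;
    # fn is assumed to return a single Tensor for simplicity.
    trace, trace_inputs = torch._C._tracer_enter(inputs)
    try:
        out = fn(*trace_inputs)
        torch._C._tracer_exit((out,))   # register outputs, clear the global state
    except Exception:
        torch._C._tracer_abandon()      # reset the state so a later trace can start
        raise
    return trace, out
```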
- -struct TracingState : public std::enable_shared_from_this { - TracingState(); - ~TracingState(); - - std::shared_ptr graph; - std::mutex mutex; - bool active; - - std::unique_lock lock() { - return std::unique_lock(mutex); - } -}; - -struct ValueTracingStateElem { - std::weak_ptr state; - // it's only valid to use this field if !state.exired() - Value* trace = nullptr; - - void reset() { - state.reset(); - trace = nullptr; - } -}; - -using ValueTracingState = std::list; - -}}} // namespace torch::jit::tracer diff --git a/torch/csrc/jit/variable_flags.cpp b/torch/csrc/jit/variable_flags.cpp deleted file mode 100644 index 8ab565d1a23f59..00000000000000 --- a/torch/csrc/jit/variable_flags.cpp +++ /dev/null @@ -1,19 +0,0 @@ -#include "torch/csrc/jit/variable_flags.h" - -#include "torch/csrc/autograd/variable.h" -#include "torch/csrc/jit/tracer_state.h" - -using torch::autograd::Variable; - -namespace torch { namespace jit { - -// These definitions require Variable struct to be defined, so they can't be -// in tracer_state.h -VariableFlags VariableFlags::of(const Variable& var) { - VariableFlags f; - f.defined = var.defined(); - f.requires_grad = f.defined && var.requires_grad(); - return f; -} - -}} diff --git a/torch/csrc/jit/variable_flags.h b/torch/csrc/jit/variable_flags.h deleted file mode 100644 index 43c3ef9bf89a1a..00000000000000 --- a/torch/csrc/jit/variable_flags.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once -#include -namespace torch { namespace autograd { -struct Variable; -}} - -namespace torch { namespace jit { - -struct VariableFlags { - static VariableFlags of(const autograd::Variable& var); - - bool requires_grad; - bool defined; -}; - -static inline std::ostream & operator<<(std::ostream & out, const VariableFlags& v) { - return out - << "(requires_grad=" << v.requires_grad - << ", defined=" << v.defined << ")"; -} - -}} diff --git a/torch/csrc/utils.cpp b/torch/csrc/utils.cpp index bcee9df993c27c..56f18033ef088f 100644 --- a/torch/csrc/utils.cpp +++ b/torch/csrc/utils.cpp @@ -230,6 +230,7 @@ bool maybeThrowBackCompatKeepdimWarn(char *func) { template<> void THPPointer::free() { - if (ptr) + if (ptr) { THTensor_free(LIBRARY_STATE ptr); + } } diff --git a/torch/csrc/utils/auto_stream.h b/torch/csrc/utils/auto_stream.h deleted file mode 100644 index 8d7b4d76008727..00000000000000 --- a/torch/csrc/utils/auto_stream.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once - -// RAII structs to set CUDA stream - -#ifdef USE_CUDA -#include -extern THCState* state; -#endif - -struct AutoStream { -#ifdef USE_CUDA - explicit AutoStream(THCStream* stream) - : original_stream(THCState_getStream(state)) - { - THCStream_retain(original_stream); - THCState_setStream(state, stream); - } - - ~AutoStream() { - THCState_setStream(state, original_stream); - THCStream_free(original_stream); - } - - THCStream* original_stream; -#endif -}; diff --git a/torch/csrc/utils/auto_unique_ptr.h b/torch/csrc/utils/auto_unique_ptr.h deleted file mode 100644 index d49a03608e447c..00000000000000 --- a/torch/csrc/utils/auto_unique_ptr.h +++ /dev/null @@ -1,21 +0,0 @@ -#pragma once - -#include - -namespace torch { - -// A unique_ptr that automatically constructs the object on first dereference. 
-template -struct auto_unique_ptr : public std::unique_ptr { - T& operator*() { - if (!this->get()) this->reset(new T()); - return *this->get(); - } - - T* operator->() { - if (!this->get()) this->reset(new T()); - return this->get(); - } -}; - -} // namespace torch diff --git a/torch/distributions/categorical.py b/torch/distributions/categorical.py index e9cfe030637a05..b1ecc5e93316fc 100644 --- a/torch/distributions/categorical.py +++ b/torch/distributions/categorical.py @@ -1,4 +1,5 @@ import torch +from torch._six import nan from torch.distributions import constraints from torch.distributions.distribution import Distribution from torch.distributions.utils import probs_to_logits, logits_to_probs, lazy_property, broadcast_all @@ -72,11 +73,11 @@ def param_shape(self): @property def mean(self): - return self.probs.new_tensor(float('nan')).expand(self._extended_shape()) + return self.probs.new_tensor(nan).expand(self._extended_shape()) @property def variance(self): - return self.probs.new_tensor(float('nan')).expand(self._extended_shape()) + return self.probs.new_tensor(nan).expand(self._extended_shape()) def sample(self, sample_shape=torch.Size()): sample_shape = self._extended_shape(sample_shape) diff --git a/torch/distributions/cauchy.py b/torch/distributions/cauchy.py index 0b4f92d9b7d11b..dec9cfafe134fb 100644 --- a/torch/distributions/cauchy.py +++ b/torch/distributions/cauchy.py @@ -1,4 +1,5 @@ import math +from torch._six import inf, nan from numbers import Number import torch @@ -37,11 +38,11 @@ def __init__(self, loc, scale, validate_args=None): @property def mean(self): - return self.loc.new_tensor(float('nan')).expand(self._extended_shape()) + return self.loc.new_tensor(nan).expand(self._extended_shape()) @property def variance(self): - return self.loc.new_tensor(float('inf')).expand(self._extended_shape()) + return self.loc.new_tensor(inf).expand(self._extended_shape()) def rsample(self, sample_shape=torch.Size()): shape = self._extended_shape(sample_shape) diff --git a/torch/distributions/fishersnedecor.py b/torch/distributions/fishersnedecor.py index a4338ef903320b..23915598c356b7 100644 --- a/torch/distributions/fishersnedecor.py +++ b/torch/distributions/fishersnedecor.py @@ -1,6 +1,7 @@ from numbers import Number import torch import math +from torch._six import nan from torch.distributions import constraints from torch.distributions.distribution import Distribution from torch.distributions.gamma import Gamma @@ -39,13 +40,13 @@ def __init__(self, df1, df2, validate_args=None): @property def mean(self): df2 = self.df2.clone() - df2[df2 <= 2] = float('nan') + df2[df2 <= 2] = nan return df2 / (df2 - 2) @property def variance(self): df2 = self.df2.clone() - df2[df2 <= 4] = float('nan') + df2[df2 <= 4] = nan return 2 * df2.pow(2) * (self.df1 + df2 - 2) / (self.df1 * (df2 - 2).pow(2) * (df2 - 4)) def rsample(self, sample_shape=torch.Size(())): diff --git a/torch/distributions/half_cauchy.py b/torch/distributions/half_cauchy.py index 8b979e2137dfbf..77a50d3f03c49f 100644 --- a/torch/distributions/half_cauchy.py +++ b/torch/distributions/half_cauchy.py @@ -1,5 +1,6 @@ import math +from torch._six import inf from torch.distributions import constraints from torch.distributions.transforms import AbsTransform from torch.distributions.cauchy import Cauchy @@ -44,7 +45,7 @@ def variance(self): def log_prob(self, value): log_prob = self.base_dist.log_prob(value) + math.log(2) - log_prob[value.expand(log_prob.shape) < 0] = -float('inf') + log_prob[value.expand(log_prob.shape) < 0] = -inf 
return log_prob def cdf(self, value): diff --git a/torch/distributions/half_normal.py b/torch/distributions/half_normal.py index 165045b7614092..059f3837604a63 100644 --- a/torch/distributions/half_normal.py +++ b/torch/distributions/half_normal.py @@ -1,5 +1,6 @@ import math +from torch._six import inf from torch.distributions import constraints from torch.distributions.transforms import AbsTransform from torch.distributions.normal import Normal @@ -44,7 +45,7 @@ def variance(self): def log_prob(self, value): log_prob = self.base_dist.log_prob(value) + math.log(2) - log_prob[value.expand(log_prob.shape) < 0] = -float('inf') + log_prob[value.expand(log_prob.shape) < 0] = -inf return log_prob def cdf(self, value): diff --git a/torch/distributions/kl.py b/torch/distributions/kl.py index 2ae67fc28ccbcd..caedb3e93a1335 100644 --- a/torch/distributions/kl.py +++ b/torch/distributions/kl.py @@ -3,6 +3,7 @@ from functools import total_ordering import torch +from torch._six import inf from .bernoulli import Bernoulli from .beta import Beta @@ -113,7 +114,7 @@ def _infinite_like(tensor): """ Helper function for obtaining infinite KL Divergence throughout """ - return tensor.new_tensor(float('inf')).expand_as(tensor) + return tensor.new_tensor(inf).expand_as(tensor) def _x_log_x(tensor): @@ -173,10 +174,10 @@ def kl_divergence(p, q): @register_kl(Bernoulli, Bernoulli) def _kl_bernoulli_bernoulli(p, q): t1 = p.probs * (p.probs / q.probs).log() - t1[q.probs == 0] = float('inf') + t1[q.probs == 0] = inf t1[p.probs == 0] = 0 t2 = (1 - p.probs) * ((1 - p.probs) / (1 - q.probs)).log() - t2[q.probs == 1] = float('inf') + t2[q.probs == 1] = inf t2[p.probs == 1] = 0 return t1 + t2 @@ -208,7 +209,7 @@ def _kl_binomial_binomial(p, q): @register_kl(Categorical, Categorical) def _kl_categorical_categorical(p, q): t = p.probs * (p.logits - q.logits) - t[q.probs == 0] = float('inf') + t[q.probs == 0] = inf t[p.probs == 0] = 0 return t.sum(-1) @@ -322,7 +323,7 @@ def _kl_pareto_pareto(p, q): t1 = q.alpha * scale_ratio.log() t2 = -alpha_ratio.log() result = t1 + t2 + alpha_ratio - 1 - result[p.support.lower_bound < q.support.lower_bound] = float('inf') + result[p.support.lower_bound < q.support.lower_bound] = inf return result @@ -346,7 +347,7 @@ def _kl_transformed_transformed(p, q): @register_kl(Uniform, Uniform) def _kl_uniform_uniform(p, q): result = ((q.high - q.low) / (p.high - p.low)).log() - result[(q.low > p.low) | (q.high < p.high)] = float('inf') + result[(q.low > p.low) | (q.high < p.high)] = inf return result @@ -392,7 +393,7 @@ def _kl_beta_normal(p, q): @register_kl(Beta, Uniform) def _kl_beta_uniform(p, q): result = -p.entropy() + (q.high - q.low).log() - result[(q.low > p.support.lower_bound) | (q.high < p.support.upper_bound)] = float('inf') + result[(q.low > p.support.lower_bound) | (q.high < p.support.upper_bound)] = inf return result @@ -543,7 +544,7 @@ def _kl_pareto_exponential(p, q): t2 = p.alpha.reciprocal() t3 = p.alpha * scale_rate_prod / (p.alpha - 1) result = t1 - t2 + t3 - 1 - result[p.alpha <= 1] = float('inf') + result[p.alpha <= 1] = inf return result @@ -555,7 +556,7 @@ def _kl_pareto_gamma(p, q): t3 = (1 - q.concentration) * common_term t4 = q.rate * p.alpha * p.scale / (p.alpha - 1) result = t1 + t2 + t3 + t4 - 1 - result[p.alpha <= 1] = float('inf') + result[p.alpha <= 1] = inf return result # TODO: Add Pareto-Laplace KL Divergence @@ -570,7 +571,7 @@ def _kl_pareto_normal(p, q): t3 = p.alpha * common_term.pow(2) / (p.alpha - 2) t4 = (p.alpha * common_term - q.loc).pow(2) result 
= t1 - t2 + (t3 + t4) / var_normal - 1 - result[p.alpha <= 2] = float('inf') + result[p.alpha <= 2] = inf return result @@ -588,14 +589,14 @@ def _kl_uniform_beta(p, q): t3 = (q.concentration0 - 1) * (_x_log_x((1 - p.high)) - _x_log_x((1 - p.low)) + common_term) / common_term t4 = q.concentration1.lgamma() + q.concentration0.lgamma() - (q.concentration1 + q.concentration0).lgamma() result = t3 + t4 - t1 - t2 - result[(p.high > q.support.upper_bound) | (p.low < q.support.lower_bound)] = float('inf') + result[(p.high > q.support.upper_bound) | (p.low < q.support.lower_bound)] = inf return result @register_kl(Uniform, Exponential) def _kl_uniform_exponetial(p, q): result = q.rate * (p.high + p.low) / 2 - ((p.high - p.low) * q.rate).log() - result[p.low < q.support.lower_bound] = float('inf') + result[p.low < q.support.lower_bound] = inf return result @@ -607,7 +608,7 @@ def _kl_uniform_gamma(p, q): t3 = (1 - q.concentration) * (_x_log_x(p.high) - _x_log_x(p.low) - common_term) / common_term t4 = q.rate * (p.high + p.low) / 2 result = -t1 + t2 + t3 + t4 - result[p.low < q.support.lower_bound] = float('inf') + result[p.low < q.support.lower_bound] = inf return result @@ -638,5 +639,5 @@ def _kl_uniform_pareto(p, q): t1 = (q.alpha * q.scale.pow(q.alpha) * (support_uniform)).log() t2 = (_x_log_x(p.high) - _x_log_x(p.low) - support_uniform) / support_uniform result = t2 * (q.alpha + 1) - t1 - result[p.low < q.support.lower_bound] = float('inf') + result[p.low < q.support.lower_bound] = inf return result diff --git a/torch/distributions/multinomial.py b/torch/distributions/multinomial.py index 57a5853ff10b5c..c045557df8ee6c 100644 --- a/torch/distributions/multinomial.py +++ b/torch/distributions/multinomial.py @@ -1,4 +1,5 @@ import torch +from torch._six import inf from torch.distributions.distribution import Distribution from torch.distributions import Categorical from numbers import Number @@ -93,6 +94,6 @@ def log_prob(self, value): logits, value = broadcast_all(self.logits.clone(), value) log_factorial_n = torch.lgamma(value.sum(-1) + 1) log_factorial_xs = torch.lgamma(value + 1).sum(-1) - logits[(value == 0) & (logits == -float('inf'))] = 0 + logits[(value == 0) & (logits == -inf)] = 0 log_powers = (logits * value).sum(-1) return log_factorial_n - log_factorial_xs + log_powers diff --git a/torch/distributions/studentT.py b/torch/distributions/studentT.py index e1c7cbe533a6ab..e91c7cf88176cf 100644 --- a/torch/distributions/studentT.py +++ b/torch/distributions/studentT.py @@ -1,5 +1,6 @@ from numbers import Number import torch +from torch._six import inf, nan import math from torch.distributions import constraints from torch.distributions.distribution import Distribution @@ -27,15 +28,15 @@ class StudentT(Distribution): @property def mean(self): m = self.loc.clone() - m[self.df <= 1] = float('nan') + m[self.df <= 1] = nan return m @property def variance(self): m = self.df.clone() m[self.df > 2] = self.scale[self.df > 2].pow(2) * self.df[self.df > 2] / (self.df[self.df > 2] - 2) - m[(self.df <= 2) & (self.df > 1)] = float('inf') - m[self.df <= 1] = float('nan') + m[(self.df <= 2) & (self.df > 1)] = inf + m[self.df <= 1] = nan return m def __init__(self, df, loc=0., scale=1., validate_args=None): diff --git a/torch/functional.py b/torch/functional.py index adc99f40fcd20f..19d47f394fa757 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -1,4 +1,6 @@ import torch +import torch.nn.functional as F +from torch._six import inf from operator import mul from functools import reduce import 
math @@ -8,9 +10,11 @@ 'argmin', 'btrifact', 'btriunpack', + 'isfinite', 'isinf', 'isnan', 'split', + 'stft', 'unique', ] @@ -136,6 +140,25 @@ def btriunpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True): return P, L, U +def isfinite(tensor): + r"""Returns a new tensor with boolean elements representing if each element is `Finite` or not. + + Arguments: + tensor (Tensor): A tensor to check + + Returns: + Tensor: A ``torch.ByteTensor`` containing a 1 at each location of finite elements and 0 otherwise + + Example:: + + >>> torch.isfinite(torch.Tensor([1, float('inf'), 2, float('-inf'), float('nan')])) + tensor([ 1, 0, 1, 0, 0], dtype=torch.uint8) + """ + if not isinstance(tensor, torch.Tensor): + raise ValueError("The argument is not a tensor", str(tensor)) + return (tensor == tensor) & (tensor.abs() != inf) + + def isinf(tensor): r"""Returns a new tensor with boolean elements representing if each element is `+/-INF` or not. @@ -152,7 +175,100 @@ def isinf(tensor): """ if not isinstance(tensor, torch.Tensor): raise ValueError("The argument is not a tensor", str(tensor)) - return tensor.abs() == float('inf') + return tensor.abs() == inf + + +def stft(input, n_fft, hop_length=None, win_length=None, window=None, + center=True, pad_mode='reflect', normalized=False, onesided=True): + r"""Short-time Fourier transform (STFT). + + Ignoring the optional batch dimension, this method computes the following + expression: + + .. math:: + X[m, \omega] = \sum_{k = 0}^{\text{win_length}}% + window[k]\ input[m \times hop_length + k]\ % + e^{- j \frac{2 \pi \cdot \omega k}{\text{win_length}}}, + + where :math:`m` is the index of the sliding window, and :math:`\omega` is + the frequency that :math:`0 \leq \omega < \text{n_fft}`. When + :attr:`onesided` is the default value ``True``, + + * :attr:`input` must be either a 1-D time sequenceor 2-D a batch of time + sequences. + + * If :attr:`hop_length` is ``None`` (default), it is treated as equal to + ``floor(n_fft / 4)``. + + * If :attr:`win_length` is ``None`` (default), it is treated as equal to + :attr:`n_fft`. + + * :attr:`window` can be a 1-D tensor of size :attr:`win_length`, e.g., from + :meth:`torch.hann_window`. If :attr:`window` is ``None`` (default), it is + treated as if having :math:`1` everywhere in the window. If + :math:`\text{win_length} < \text{n_fft}`, :attr:`window` will be padded on + both sides to length :attr:`n_fft` before being applied. + + * If :attr:`center` is ``True`` (default), :attr:`input` will be padded on + both sides so that the :math:`t`-th frame is centered at time + :math:`t \times \text{hop_length}`. Otherwise, the :math:`t`-th frame + begins at time :math:`t \times \text{hop_length}`. + + * :attr:`pad_mode` determines the padding method used on :attr:`input` when + :attr:`center` is ``True``. See :meth:`torch.nn.functional.pad` for + all available options. Default is ``"reflect"``. + + * If :attr:`onesided` is ``True`` (default), only values for :math:`\omega` + in :math:`\left[0, 1, 2, \dots, \left\lfloor \frac{\text{n_fft}}{2} \right\rfloor + 1\right]` + are returned because the real-to-complex Fourier transform satisfies the + conjugate symmetry, i.e., :math:`X[m, \omega] = X[m, \text{n_fft} - \omega]^*`. + + * If :attr:`normalized` is ``True`` (default is ``False``), the function + returns the normalized STFT results, i.e., multiplied by :math:`(\text{frame_length})^{-0.5}`. 
+ + Returns the real and the imaginary parts together as one tensor of size + :math:`(* \times N \times T \times 2)`, where :math:`*` is the optional + batch size of :attr:`input`, :math:`N` is the number of frequencies where + STFT is applied, :math:`T` is the total number of frames used, and each pair + in the last dimension represents a complex number as the real part and the + imaginary part. + + .. warning:: + This function changed signature at version 0.4.1. Calling with the + previous signature may cause error or return incorrect result. + + Arguments: + input (Tensor): the input tensor + n_fft (int, optional): size of Fourier transform + hop_length (int): the distance between neighboring sliding window + frames. Default: ``None`` (treated as equal to ``floor(n_fft / 4)``) + win_length (int): the size of window frame and STFT filter. + Default: ``None`` (treated as equal to :attr:`n_fft`) + window (Tensor, optional): the optional window function. + Default: ``None`` (treated as window of all :math:`1`s) + center (bool, optional): whether to pad :attr:`input` on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop_length}`. + Default: ``True`` + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. Default: ``"reflect"`` + normalized (bool, optional): controls whether to return the normalized STFT results + Default: ``False`` + onesided (bool, optional): controls whether to return half of results to + avoid redundancy Default: ``True`` + + Returns: + Tensor: A tensor containing the STFT result with shape described above + + """ + # TODO: after having proper ways to map Python strings to ATen Enum, move + # this and F.pad to ATen. + if center: + signal_dim = input.dim() + extended_shape = [1] * (3 - signal_dim) + list(input.size()) + pad = int(n_fft // 2) + input = F.pad(input.view(extended_shape), (pad, pad), pad_mode) + input = input.view(input.shape[-signal_dim:]) + return torch._C._VariableFunctions.stft(input, n_fft, hop_length, win_length, window, normalized, onesided) def isnan(tensor): diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index fbf3fabbcfc113..4b605412dcd3a8 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -24,21 +24,10 @@ _jit_script_compile = torch._C._jit_script_compile BatchTensor = torch._C._jit.BatchTensor -# This global variable is set when we are tracing a *forwards* computation. -# It is intended to be a cheap way to test if tracing has occurred, before -# doing the slower path using `get_tracing_state` (below.) -_tracing = False - - -def get_tracing_state(args): - if not torch._C._is_tracing(args): - return None - return torch._C._get_tracing_state(args) - @contextlib.contextmanager -def scope(scope_name, *vars): - tracing_state = get_tracing_state(vars) +def scope(scope_name): + tracing_state = torch._C._get_tracing_state() if tracing_state: tracing_state.push_scope(scope_name) try: @@ -98,18 +87,19 @@ def __init__(self, inner): self.inner = inner def forward(self, *args): - global _tracing in_vars, in_desc = _flatten(args) # NOTE: use full state, because we need it for BatchNorm export # This differs from the compiler path, which doesn't support it at the moment. 
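A short usage example for the `torch.stft` wrapper added in `torch/functional.py` above (input sizes are made up): with `center=True` the signal is reflect-padded by `n_fft // 2` on each side before the ATen kernel runs, so frame `t` is centered at `t * hop_length`.

```python
import torch

signal = torch.randn(2, 16000)                 # (batch, time)
window = torch.hann_window(400)
spec = torch.stft(signal, n_fft=400, hop_length=160, window=window)
# onesided=True by default, so the result has shape
# (batch, n_fft // 2 + 1, n_frames, 2); here (2, 201, 101, 2).
print(spec.shape)
```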
module_state = list(_unique_state_dict(self, keep_vars=True).values()) trace, all_trace_inputs = torch._C._tracer_enter(in_vars + module_state) - _tracing = True - trace_inputs = _unflatten(all_trace_inputs[:len(in_vars)], in_desc) - out = self.inner(*trace_inputs) - out_vars, _ = _flatten(out) - _tracing = False - torch._C._tracer_exit(out_vars) + try: + trace_inputs = _unflatten(all_trace_inputs[:len(in_vars)], in_desc) + out = self.inner(*trace_inputs) + out_vars, _ = _flatten(out) + torch._C._tracer_exit(out_vars) + except Exception: + torch._C._tracer_abandon() + raise return trace, out @@ -289,13 +279,7 @@ def wrapper(func): if len(kwargs) != 0: raise TypeError("got unexpected keyword arguments: {}".format(", ".join(kwargs.keys()))) - if isinstance(func, torch.nn.Module): - orig = func - else: - # traced functions become a method on an Empty module - orig = Module() - - module = TopLevelTracedModule(orig, **executor_options) + module = TopLevelTracedModule(func, **executor_options) module._create_method_from_trace('forward', func, args) return module @@ -683,10 +667,17 @@ class TracedModule(ScriptModule): __frozen = False def __init__(self, orig, id_set=None, optimize=True): + # XXX: orig can be a nn.Module or a function! super(TracedModule, self).__init__(optimize=optimize) if id_set is None: id_set = set() + if not isinstance(orig, torch.nn.Module): + self._name = orig.__name__ + orig = torch.nn.Module() + else: + self._name = 'TracedModule[' + type(orig).__name__ + ']' + def check_unique(param): if param in id_set: raise ValueError("TracedModules don't support parameter sharing between modules") @@ -702,7 +693,6 @@ def check_unique(param): if buf is not None: self._buffers[name] = buf check_unique(buf) - self._orig_class = type(orig) if orig._backward_hooks or orig._forward_hooks or orig._forward_pre_hooks: raise ValueError("Modules that have hooks assigned can't be compiled") @@ -719,7 +709,7 @@ def _freeze(self): self.__frozen = True def _get_name(self): - return 'TracedModule[' + self._orig_class.__name__ + ']' + return self._name def __setattr__(self, attr, value): if not self.__frozen or hasattr(self, attr): diff --git a/torch/legacy/nn/Normalize.py b/torch/legacy/nn/Normalize.py index 1c22f37af84155..1704bdf32b318a 100644 --- a/torch/legacy/nn/Normalize.py +++ b/torch/legacy/nn/Normalize.py @@ -1,4 +1,5 @@ import torch +from torch._six import inf from .Module import Module from .utils import clear @@ -34,7 +35,7 @@ def updateOutput(self, input): self._output.resize_as_(input) # specialization for the infinity norm - if self.p == float('inf'): + if self.p == inf: if not self._indices: self._indices = torch.cuda.FloatTensor() if torch.typename(self.output) == 'torch.cuda.FloatTensor' \ else torch.LongTensor() @@ -72,7 +73,7 @@ def updateGradInput(self, input, gradOutput): self.cross = input.new() # compute diagonal term with gradOutput self._gradInput.resize_(n, d) - if self.p == float('inf'): + if self.p == inf: # specialization for the inf case torch.mul(self.norm.view(n, 1, 1).expand(n, d, 1), gradOutput, out=self._gradInput) self.buffer.resize_as_(input).zero_() @@ -113,7 +114,7 @@ def updateGradInput(self, input, gradOutput): self._gradInput.add_(-1, self.buffer) # reuse cross buffer for normalization - if self.p == float('inf'): + if self.p == inf: torch.mul(self.norm, self.norm, out=self.cross) else: torch.mul(self.normp, self.norm, out=self.cross) diff --git a/torch/legacy/optim/cg.py b/torch/legacy/optim/cg.py index 118de3bd96aac8..7880489edd6f8d 100644 --- 
a/torch/legacy/optim/cg.py +++ b/torch/legacy/optim/cg.py @@ -1,10 +1,11 @@ import math INFINITY = float('inf') +NAN = float('nan') def sqrt_nothrow(x): - return math.sqrt(x) if x >= 0 else float('nan') + return math.sqrt(x) if x >= 0 else NAN def cg(opfunc, x, config, state=None): @@ -145,7 +146,7 @@ def cg(opfunc, x, config, state=None): A = 6 * (f2 - f3) / z3 + 3 * (d2 + d3) B = 3 * (f3 - f2) - z3 * (d3 + 2 * d2) _denom = (B + sqrt_nothrow(B * B - A * d2 * z3 * z3)) - z2 = -d2 * z3 * z3 / _denom if _denom != 0 else float('nan') + z2 = -d2 * z3 * z3 / _denom if _denom != 0 else NAN if z2 != z2 or z2 == INFINITY or z2 == -INFINITY or z2 < 0: if limit < -0.5: diff --git a/torch/lib/THD/master_worker/common/RPC-inl.hpp b/torch/lib/THD/master_worker/common/RPC-inl.hpp index b6dfb866d8a616..5885b8350504fc 100644 --- a/torch/lib/THD/master_worker/common/RPC-inl.hpp +++ b/torch/lib/THD/master_worker/common/RPC-inl.hpp @@ -1,5 +1,5 @@ #include -#include "TH/THStorage.h" +#include "TH/THStorageFunctions.h" #include "Traits.hpp" namespace thd { namespace rpc { namespace detail { diff --git a/torch/lib/THD/master_worker/common/RPC.hpp b/torch/lib/THD/master_worker/common/RPC.hpp index af6e8045f99947..99b45942b0a7dc 100644 --- a/torch/lib/THD/master_worker/common/RPC.hpp +++ b/torch/lib/THD/master_worker/common/RPC.hpp @@ -1,7 +1,7 @@ #pragma once #include "../master/THDTensor.h" #include "ByteArray.hpp" -#include "TH/THStorage.h" +#include "TH/THStorageFunctions.h" #include "RPCType.hpp" #include diff --git a/torch/lib/THD/master_worker/master/generic/THDTensor.cpp b/torch/lib/THD/master_worker/master/generic/THDTensor.cpp index e0e174ed6a17e8..93dd5d4b7246ac 100644 --- a/torch/lib/THD/master_worker/master/generic/THDTensor.cpp +++ b/torch/lib/THD/master_worker/master/generic/THDTensor.cpp @@ -826,8 +826,7 @@ ptrdiff_t THDTensor_(nElement)(const THDTensor *self) { } void THDTensor_(retain)(THDTensor *tensor) { - if (tensor->flag & TH_TENSOR_REFCOUNTED) - tensor->refcount++; + tensor->refcount++; } void THDTensor_(free)(THDTensor *tensor) { diff --git a/torch/lib/THD/master_worker/master/generic/THDTensorMeta.cpp b/torch/lib/THD/master_worker/master/generic/THDTensorMeta.cpp index 80214f0637bed1..05ec09748ce0ad 100644 --- a/torch/lib/THD/master_worker/master/generic/THDTensorMeta.cpp +++ b/torch/lib/THD/master_worker/master/generic/THDTensorMeta.cpp @@ -141,7 +141,6 @@ static THDTensor *THDTensor_(_alloc)() { new_tensor->storageOffset = 0; new_tensor->refcount = 1; - new_tensor->flag = TH_TENSOR_REFCOUNTED; new_tensor->tensor_id = THDState::s_nextId++; return new_tensor; diff --git a/torch/lib/THD/master_worker/worker/Dispatch.cpp b/torch/lib/THD/master_worker/worker/Dispatch.cpp index 1c5f3a793230c0..35e7a38731d5cb 100644 --- a/torch/lib/THD/master_worker/worker/Dispatch.cpp +++ b/torch/lib/THD/master_worker/worker/Dispatch.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include #include diff --git a/torch/lib/THD/test/rpc_serialization.cpp b/torch/lib/THD/test/rpc_serialization.cpp index 8e9a4783489b3d..cc4f437cb9050b 100644 --- a/torch/lib/THD/test/rpc_serialization.cpp +++ b/torch/lib/THD/test/rpc_serialization.cpp @@ -8,7 +8,7 @@ #include #include "../master_worker/common/RPC.hpp" -#include "TH/THStorage.h" +#include "TH/THStorageFunctions.h" using namespace std; using namespace thd; diff --git a/torch/nn/_functions/dropout.py b/torch/nn/_functions/dropout.py index 6ad2297030b73e..e35ff86bdfa8db 100644 --- a/torch/nn/_functions/dropout.py +++ b/torch/nn/_functions/dropout.py @@ -66,3 +66,65 
@@ def symbolic(g, input, p=0.5, train=False, inplace=False): def _make_noise(input): return input.new().resize_(input.size(0), input.size(1), *repeat(1, input.dim() - 2)) + + +class AlphaDropout(Dropout): + + @staticmethod + def symbolic(g, input, p=0.5, train=False, inplace=False): + # See Note [Export inplace] + # NB: In inference mode, FeatureDropout is exported as an identity op. + from torch.onnx.symbolic import _unimplemented + if train: + return _unimplemented("AlphaDropout", "training mode") + return input + + @classmethod + def forward(cls, ctx, input, p=0.5, train=False, inplace=False): + if p < 0 or p > 1: + raise ValueError("dropout probability has to be between 0 and 1, " + "but got {}".format(p)) + ctx.p = p + ctx.train = train + ctx.inplace = inplace + + if ctx.p == 0 or not ctx.train: + return input + + if ctx.inplace: + ctx.mark_dirty(input) + output = input + else: + output = input.clone() + + ctx.noise = cls._make_noise(input) + if ctx.p == 1: + a = 0 + b = ctx.noise + else: + ctx.noise.bernoulli_(1 - ctx.p) + alpha = 1.7580993408473766 + a = ((alpha ** 2 * ctx.p + 1) * (1 - ctx.p)) ** (-0.5) + b = ctx.noise.add(-1).mul_(alpha * a).add_(alpha * a * ctx.p) + ctx.noise = ctx.noise.mul_(a).expand_as(input) + b = b.expand_as(input) + output.mul_(ctx.noise).add_(b) + + return output + + +class FeatureAlphaDropout(AlphaDropout): + + @staticmethod + def symbolic(g, input, p=0.5, train=False, inplace=False): + # See Note [Export inplace] + # NB: In inference mode, FeatureDropout is exported as an identity op. + from torch.onnx.symbolic import _unimplemented + if train: + return _unimplemented("FeatureAlphaDropout", "training mode") + return input + + @staticmethod + def _make_noise(input): + return input.new().resize_(input.size(0), input.size(1), + *repeat(1, input.dim() - 2)) diff --git a/torch/nn/_functions/rnn.py b/torch/nn/_functions/rnn.py index c7f5d10ccd4df3..1cccb77b78d35f 100644 --- a/torch/nn/_functions/rnn.py +++ b/torch/nn/_functions/rnn.py @@ -310,7 +310,7 @@ def forward(input, *fargs, **fkwargs): # function gets reconstructed each and every time when RNN() is invoked # and we don't want to pay the cost of decorator invocation import torch - if torch._C._jit_is_tracing(input): + if torch._C._get_tracing_state(): import torch.onnx.symbolic sym = torch.onnx.symbolic.RNN_symbolic_builder(*args, **kwargs) cell_type = args[0] @@ -318,7 +318,7 @@ def forward(input, *fargs, **fkwargs): bound_symbolic = partial(torch.onnx.symbolic.rnn_trace_override_symbolic, cell_type, func, sym) - decorator = torch.onnx.symbolic_override_first_arg_based(bound_symbolic) + decorator = torch.onnx.symbolic_override(bound_symbolic) func = decorator(func) return func(input, *fargs, **fkwargs) diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 3de3a00cbd02a7..17a7c09b012da6 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -595,35 +595,12 @@ def dropout(input, p=0.5, training=False, inplace=False): return _functions.dropout.Dropout.apply(input, p, training, inplace) -def alpha_dropout(input, p=0.5, training=False): +def alpha_dropout(input, p=0.5, training=False, inplace=False): r"""Applies alpha dropout to the input. See :class:`~torch.nn.AlphaDropout` for details. - - Args: - p (float, optional): the drop probability. Default: 0.5 - training (bool, optional): switch between training and evaluation mode. 
Default: ``False`` """ - if p < 0 or p > 1: - raise ValueError("dropout probability has to be between 0 and 1, " - "but got {}".format(p)) - - if p == 0 or not training: - return input - - alpha = -1.7580993408473766 - keep_prob = 1 - p - # TODO avoid casting to byte after resize - noise = input.data.new().resize_(input.size()) - noise.bernoulli_(p) - noise = noise.byte() - - output = input.masked_fill(noise, alpha) - - a = (keep_prob + alpha ** 2 * keep_prob * (1 - keep_prob)) ** (-0.5) - b = -a * alpha * (1 - keep_prob) - - return output.mul_(a).add_(b) + return _functions.dropout.AlphaDropout.apply(input, p, training, inplace) def dropout2d(input, p=0.5, training=False, inplace=False): @@ -634,6 +611,10 @@ def dropout3d(input, p=0.5, training=False, inplace=False): return _functions.dropout.FeatureDropout.apply(input, p, training, inplace) +def feature_alpha_dropout(input, p=0.5, training=False, inplace=False): + return _functions.dropout.FeatureAlphaDropout.apply(input, p, training, inplace) + + def threshold(input, threshold, value, inplace=False): r"""Thresholds each element of the input Tensor. @@ -1293,7 +1274,7 @@ def instance_norm(input, running_mean=None, running_var=None, weight=None, import torch.onnx.symbolic - @torch.onnx.symbolic_override_first_arg_based(torch.onnx.symbolic.instance_norm) + @torch.onnx.symbolic_override(torch.onnx.symbolic.instance_norm) def _instance_norm(input, running_mean=None, running_var=None, weight=None, bias=None, use_input_stats=None, momentum=None, eps=None): # Repeat stored stats and affine transform params if necessary diff --git a/torch/nn/modules/__init__.py b/torch/nn/modules/__init__.py index cea0e41f399e56..4d98f482768a63 100644 --- a/torch/nn/modules/__init__.py +++ b/torch/nn/modules/__init__.py @@ -16,7 +16,7 @@ from .batchnorm import BatchNorm1d, BatchNorm2d, BatchNorm3d from .instancenorm import InstanceNorm1d, InstanceNorm2d, InstanceNorm3d from .normalization import LocalResponseNorm, CrossMapLRN2d, LayerNorm, GroupNorm -from .dropout import Dropout, Dropout2d, Dropout3d, AlphaDropout +from .dropout import Dropout, Dropout2d, Dropout3d, AlphaDropout, FeatureAlphaDropout from .padding import ReflectionPad1d, ReflectionPad2d, ReplicationPad1d, ReplicationPad2d, \ ReplicationPad3d, ZeroPad2d, ConstantPad1d, ConstantPad2d, ConstantPad3d from .sparse import Embedding, EmbeddingBag @@ -40,7 +40,8 @@ 'ParameterList', 'ParameterDict', 'AvgPool1d', 'AvgPool2d', 'AvgPool3d', 'MaxPool1d', 'MaxPool2d', 'MaxPool3d', 'MaxUnpool1d', 'MaxUnpool2d', 'MaxUnpool3d', 'FractionalMaxPool2d', 'LPPool1d', 'LPPool2d', 'LocalResponseNorm', 'BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'InstanceNorm1d', - 'InstanceNorm2d', 'InstanceNorm3d', 'LayerNorm', 'GroupNorm', 'Dropout', 'Dropout2d', 'Dropout3d', 'AlphaDropout', + 'InstanceNorm2d', 'InstanceNorm3d', 'LayerNorm', 'GroupNorm', + 'Dropout', 'Dropout2d', 'Dropout3d', 'AlphaDropout', 'FeatureAlphaDropout', 'ReflectionPad1d', 'ReflectionPad2d', 'ReplicationPad2d', 'ReplicationPad1d', 'ReplicationPad3d', 'CrossMapLRN2d', 'Embedding', 'EmbeddingBag', 'RNNBase', 'RNN', 'LSTM', 'GRU', 'RNNCell', 'LSTMCell', 'GRUCell', 'PixelShuffle', 'Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d', 'PairwiseDistance', diff --git a/torch/nn/modules/dropout.py b/torch/nn/modules/dropout.py index e0900954724932..48415f61929f65 100644 --- a/torch/nn/modules/dropout.py +++ b/torch/nn/modules/dropout.py @@ -131,7 +131,7 @@ def forward(self, input): return F.dropout3d(input, self.p, self.training, self.inplace) -class 
AlphaDropout(Module): +class AlphaDropout(_DropoutNd): r"""Applies Alpha Dropout over the input. Alpha Dropout is a type of Dropout that maintains the self-normalizing @@ -153,6 +153,8 @@ class AlphaDropout(Module): Args: p (float): probability of an element to be dropped. Default: 0.5 + inplace (bool, optional): If set to ``True``, will do this operation + in-place Shape: - Input: `Any`. Input can be of any shape @@ -167,16 +169,11 @@ class AlphaDropout(Module): .. _Self-Normalizing Neural Networks: https://arxiv.org/abs/1706.02515 """ - def __init__(self, p=0.5): - super(AlphaDropout, self).__init__() - if p < 0 or p > 1: - raise ValueError("dropout probability has to be between 0 and 1, " - "but got {}".format(p)) - self.p = p - def forward(self, input): return F.alpha_dropout(input, self.p, self.training) - def __repr__(self): - return self.__class__.__name__ + '(' \ - + 'p=' + str(self.p) + ')' + +class FeatureAlphaDropout(_DropoutNd): + + def forward(self, input): + return F.feature_alpha_dropout(input, self.p, self.training) diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py index 961ce858f2aab2..489e8998843f98 100644 --- a/torch/nn/modules/loss.py +++ b/torch/nn/modules/loss.py @@ -523,7 +523,7 @@ class BCEWithLogitsLoss(_Loss): :math:`p_n > 1` increases the recall, :math:`p_n < 1` increases the precision. For example, if a dataset contains 100 positive and 300 negative examples of a single class, - then `pos_weight` for the class should be equal to math:`\frac{300}{100}=3`. + then `pos_weight` for the class should be equal to :math:`\frac{300}{100}=3`. The loss would act as if the dataset contains math:`3\times 100=300` positive examples. Args: diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index 91bab5c39e2f10..a00ff3dd9c268c 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -450,16 +450,16 @@ def _tracing_name(self, tracing_state): def _slow_forward(self, *input, **kwargs): input_vars = tuple(torch.autograd.function._iter_tensors(input)) - tracing_state = torch.jit.get_tracing_state(input_vars) + tracing_state = torch._C._get_tracing_state() if not tracing_state: return self.forward(*input, **kwargs) if not hasattr(tracing_state, '_traced_module_stack'): tracing_state._traced_module_stack = [] name = self._tracing_name(tracing_state) if name: - tracing_state.push_scope('%s[%s]' % (self.__class__.__name__, name)) + tracing_state.push_scope('%s[%s]' % (self._get_name(), name)) else: - tracing_state.push_scope(self.__class__.__name__) + tracing_state.push_scope(self._get_name()) tracing_state._traced_module_stack.append(self) try: result = self.forward(*input, **kwargs) @@ -471,7 +471,7 @@ def _slow_forward(self, *input, **kwargs): def __call__(self, *input, **kwargs): for hook in self._forward_pre_hooks.values(): hook(self, input) - if torch.jit._tracing: + if torch._C._get_tracing_state(): result = self._slow_forward(*input, **kwargs) else: result = self.forward(*input, **kwargs) diff --git a/torch/nn/parallel/_functions.py b/torch/nn/parallel/_functions.py index bdb770bd75b94a..48b2a77d2d598d 100644 --- a/torch/nn/parallel/_functions.py +++ b/torch/nn/parallel/_functions.py @@ -76,19 +76,17 @@ class Scatter(Function): @staticmethod def forward(ctx, target_gpus, chunk_sizes, dim, input): - ctx.target_gpus = target_gpus - ctx.chunk_sizes = chunk_sizes ctx.dim = dim ctx.input_device = input.get_device() if input.is_cuda else -1 streams = None if ctx.input_device == -1: # Perform CPU to GPU copies in a background stream - 
streams = [_get_stream(device) for device in ctx.target_gpus] - outputs = comm.scatter(input, ctx.target_gpus, ctx.chunk_sizes, ctx.dim, streams) + streams = [_get_stream(device) for device in target_gpus] + outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams) # Synchronize with the copy stream if streams is not None: for i, output in enumerate(outputs): - with torch.cuda.device(ctx.target_gpus[i]): + with torch.cuda.device(target_gpus[i]): main_stream = torch.cuda.current_stream() main_stream.wait_stream(streams[i]) output.record_stream(main_stream) diff --git a/torch/nn/utils/clip_grad.py b/torch/nn/utils/clip_grad.py index db808adcf70b29..fcccc1f80bdde4 100644 --- a/torch/nn/utils/clip_grad.py +++ b/torch/nn/utils/clip_grad.py @@ -1,5 +1,6 @@ import warnings import torch +from torch._six import inf def clip_grad_norm_(parameters, max_norm, norm_type=2): @@ -23,7 +24,7 @@ def clip_grad_norm_(parameters, max_norm, norm_type=2): parameters = list(filter(lambda p: p.grad is not None, parameters)) max_norm = float(max_norm) norm_type = float(norm_type) - if norm_type == float('inf'): + if norm_type == inf: total_norm = max(p.grad.data.abs().max() for p in parameters) else: total_norm = 0 diff --git a/torch/nn/utils/rnn.py b/torch/nn/utils/rnn.py index 4c1eee05cba879..d91797f00b8114 100644 --- a/torch/nn/utils/rnn.py +++ b/torch/nn/utils/rnn.py @@ -168,8 +168,7 @@ def pack_padded_sequence_trace_wrapper(input, lengths): return tuple(o for o in outputs) -pack_padded_sequence = torch.onnx.symbolic_override_first_arg_based( - _symbolic_pack_padded_sequence)(pack_padded_sequence) +pack_padded_sequence = torch.onnx.symbolic_override(_symbolic_pack_padded_sequence)(pack_padded_sequence) def pad_packed_sequence(sequence, batch_first=False, padding_value=0.0, total_length=None): @@ -264,8 +263,7 @@ def pad_packed_sequence_trace_wrapper(data, batch_sizes): return data, lengths -pad_packed_sequence = torch.onnx.symbolic_override_packed_sequence_based( - _symbolic_pad_packed_sequence)(pad_packed_sequence) +pad_packed_sequence = torch.onnx.symbolic_override(_symbolic_pad_packed_sequence)(pad_packed_sequence) def pad_sequence(sequences, batch_first=False, padding_value=0): diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py index 1807b711a1ccea..0514343da18284 100644 --- a/torch/onnx/__init__.py +++ b/torch/onnx/__init__.py @@ -56,48 +56,6 @@ def _run_symbolic_method(*args, **kwargs): return utils._run_symbolic_method(*args, **kwargs) -def _symbolic_override_wrapper_maker(symbolic_fn, might_trace, fn): - - def wrapper(*args, **kwargs): - import torch - import torch.jit - from torch.autograd import Function, function - - # fast pass - if not might_trace(args): - return fn(*args, **kwargs) - - flat_args = tuple(function._iter_tensors_permissive(args)) - flat_args_only_tensors = tuple(t for t in flat_args if isinstance(t, torch.Tensor)) - if not any(map(torch._C._jit_is_tracing, flat_args_only_tensors)): - return fn(*args, **kwargs) - - tstate = torch._C._get_tracing_state(flat_args_only_tensors) - - arg_values = [torch._C._get_value_trace(tstate, x) if isinstance(x, torch.Tensor) else x for x in flat_args] - - # This must come after the calls to get_value_trace, lest we - # lose information due to in-place operations. 
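A small usage example for the `clip_grad_norm_` hunk above: the comparison now uses `torch._six.inf`, but callers can keep passing `float('inf')`, since the two compare equal.

```python
import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_

model = nn.Linear(10, 10)
model(torch.randn(4, 10)).sum().backward()
# Infinity norm: scales gradients so the largest absolute entry is <= 1.0.
clip_grad_norm_(model.parameters(), max_norm=1.0, norm_type=float('inf'))
```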
- output_vars = fn(*args, **kwargs) - - symbolic_args = function._unflatten(arg_values, args) - output_vals = symbolic_fn(tstate.graph(), *symbolic_args, **kwargs) - - for var, val in zip( - function._iter_tensors(output_vars), - function._iter_jit_values(output_vals)): - val.inferTypeFrom(var.data) - torch._C._set_value_trace(tstate, var, val) - - return output_vars - - # fn might be autograd.Function too, in this case wrapping doesn't work - if isinstance(fn, types.FunctionType): - wrapper = functools.wraps(fn)(wrapper) - - return wrapper - - def symbolic_override(symbolic_fn): r""" Decorator to override ONNX export of the a function with specified subgraph. @@ -123,47 +81,36 @@ def foo(x, y): return x + y[0] + y[1] ``` """ + def decorator(fn): + import torch + from torch.autograd import function - return functools.partial(_symbolic_override_wrapper_maker, symbolic_fn, lambda x: True) - - -def symbolic_override_first_arg_based(symbolic_fn): - r""" - Decorator to override ONNX export of the a function with specified subgraph. + def wrapper(*args, **kwargs): + tstate = torch._C._get_tracing_state() + if not tstate: + return fn(*args, **kwargs) - Equivalent to :func:`symbolic_override` but checks only the first argument - of the function to figure out whether the tracing is on. Thus the first arg - needs to be a Tensor. - """ + flat_args = tuple(function._iter_tensors_permissive(args)) + arg_values = [torch._C._get_value_trace(x) if isinstance(x, torch.Tensor) else x for x in flat_args] - def might_trace(args): - import torch - first_arg = args[0] - if not isinstance(first_arg, torch.Tensor): - raise ValueError('First argument of {} is expected to be a tensor, ' - 'but got an object of type {}' - .format(symbolic_fn.__name__, type(first_arg))) - return torch._C._jit_is_tracing(first_arg) + # This must come after the calls to get_value_trace, lest we + # lose information due to in-place operations. + output_vars = fn(*args, **kwargs) - return functools.partial(_symbolic_override_wrapper_maker, symbolic_fn, might_trace) + symbolic_args = function._unflatten(arg_values, args) + output_vals = symbolic_fn(tstate.graph(), *symbolic_args, **kwargs) + for var, val in zip( + function._iter_tensors(output_vars), + function._iter_jit_values(output_vals)): + val.inferTypeFrom(var.data) + torch._C._set_value_trace(var, val) -def symbolic_override_packed_sequence_based(symbolic_fn): - r""" - Decorator to override ONNX export of the a function with specified subgraph. + return output_vars - Equivalent to :func:`symbolic_override` but checks only the first argument - of the function to figure out whether the tracing is on. Thus the first arg - needs to be a Tensor. 
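A hypothetical end-to-end use of the reworked `torch.onnx.symbolic_override` above: the decorator now keys off the single global tracing state instead of inspecting its arguments, which is why the first-argument and PackedSequence-based variants are removed in this hunk. `my_relu` and its symbolic are made-up names:

```python
import torch

def my_relu_symbolic(g, x):
    # g is the ONNX graph being traced into; emit a single Relu node.
    return g.op("Relu", x)

@torch.onnx.symbolic_override(my_relu_symbolic)
def my_relu(x):
    return x.clamp(min=0)

# Outside of tracing this is a plain call; during ONNX export the decorated
# body is replaced by the Relu subgraph produced by my_relu_symbolic.
y = my_relu(torch.randn(3))
```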
- """ + # fn might be autograd.Function too, in this case wrapping doesn't work + if isinstance(fn, types.FunctionType): + wrapper = functools.wraps(fn)(wrapper) - def might_trace(args): - import torch - first_arg = args[0] - if not isinstance(first_arg, torch.nn.utils.rnn.PackedSequence): - raise ValueError('pad_packed_sequence expects sequence to be a ' - 'PackedSequence, but got an object of type {}' - .format(type(first_arg))) - return torch._C._jit_is_tracing(first_arg[0]) - - return functools.partial(_symbolic_override_wrapper_maker, symbolic_fn, might_trace) + return wrapper + return decorator diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index f5477501080947..a88739c1cc4906 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -745,6 +745,15 @@ def _cast_func_template(to_i, g, input, non_blocking): globals()[name] = partial(_cast_func_template, v) +def zeros_like(g, input): + return g.op("Sub", input, input).setType(input.type().contiguous()) + + +def full_like(g, input, fill_value): + # TODO: a more efficient implementation (ConstantFill?) + return add(g, zeros_like(g, input), fill_value, alpha=torch.tensor(1)) + + def slice(g, self, dim, start, end, step): if step != 1: _unimplemented("slice", "step!=1 is currently not supported") diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py index b40a07580942e7..ad7f780719ccd3 100644 --- a/torch/optim/lr_scheduler.py +++ b/torch/optim/lr_scheduler.py @@ -1,4 +1,6 @@ import math +import torch +from torch._six import inf from bisect import bisect_right from functools import partial from .optimizer import Optimizer @@ -367,9 +369,9 @@ def _init_is_better(self, mode, threshold, threshold_mode): raise ValueError('threshold mode ' + threshold_mode + ' is unknown!') if mode == 'min': - self.mode_worse = float('inf') + self.mode_worse = inf else: # mode == 'max': - self.mode_worse = (-float('inf')) + self.mode_worse = -inf self.is_better = partial(self._cmp, mode, threshold_mode, threshold) diff --git a/torch/tensor.py b/torch/tensor.py index 7bef2a460db5b4..60a50b6b67b454 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -219,18 +219,6 @@ def share_memory_(self): self.storage().share_memory_() return self - def view_as(self, tensor): - r"""view_as(other) -> Tensor - - View this tensor as the same size as :attr:`other`. - ``self.view_as(other)`` is equivalent to ``self.view(other.size())``. - - Args: - other (:class:`torch.Tensor`): The result tensor has the same size - as :attr:`other.size()`. - """ - return self.view(tensor.size()) - def __reversed__(self): r"""Reverses the tensor along dimension 0.""" if self.dim() == 0: @@ -260,6 +248,17 @@ def btrifact(self, info=None, pivot=True): else: return super(Tensor, self).btrifact(pivot=pivot) + def stft(self, n_fft, hop_length=None, win_length=None, window=None, + center=True, pad_mode='reflect', normalized=False, onesided=True): + r"""See :func:`torch.stft` + + .. warning:: + This function changed signature at version 0.4.1. Calling with + the previous signature may cause error or return incorrect result. 
+ """ + return torch.stft(self, n_fft, hop_length, win_length, window, center, + pad_mode, normalized, onesided) + def resize(self, *sizes): warnings.warn("non-inplace resize is deprecated") from torch.autograd._functions import Resize diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 914edf94b190e2..16ad4130d5a418 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -303,6 +303,7 @@ def CppExtension(name, sources, *args, **kwargs): libraries = kwargs.get('libraries', []) libraries.append('caffe2') + libraries.append('torch') libraries.append('_C') kwargs['libraries'] = libraries @@ -346,6 +347,7 @@ def CUDAExtension(name, sources, *args, **kwargs): libraries.append('cudart') if sys.platform == 'win32': libraries.append('caffe2') + libraries.append('torch') libraries.append('caffe2_gpu') libraries.append('_C') kwargs['libraries'] = libraries @@ -692,6 +694,7 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose): lib_path = os.path.join(torch_path, 'lib') extra_ldflags.append('caffe2.lib') + extra_ldflags.append('torch.lib') if with_cuda: extra_ldflags.append('caffe2_gpu.lib') extra_ldflags.append('_C.lib')