diff --git a/.gitmodules.aten b/.gitmodules.aten deleted file mode 100644 index e305c927f5a006..00000000000000 --- a/.gitmodules.aten +++ /dev/null @@ -1,10 +0,0 @@ -[submodule "third_party/tbb"] - path = third_party/tbb - url = https://github.com/01org/tbb - branch = tbb_2018 -[submodule "third_party/catch"] - path = third_party/catch - url = https://github.com/catchorg/Catch2.git -[submodule "third-party/cpuinfo"] - path = third_party/cpuinfo - url = https://github.com/Maratyszcza/cpuinfo.git diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index ae1a9b0756f812..fb44d4be6ad464 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -2,9 +2,15 @@ set -ex +# The INSTALL_PREFIX here must match up with test.sh +INSTALL_PREFIX="/usr/local/caffe2" LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd) +CMAKE_ARGS=() + +# Setup SCCACHE +############################################################################### # Setup sccache if SCCACHE_BUCKET is set if [ -n "${SCCACHE_BUCKET}" ]; then mkdir -p ./sccache @@ -61,24 +67,29 @@ report_compile_cache_stats() { fi } -CMAKE_ARGS=("-DBUILD_BINARY=ON") -CMAKE_ARGS+=("-DUSE_OBSERVERS=ON") -CMAKE_ARGS+=("-DUSE_ZSTD=ON") - -if [[ $BUILD_ENVIRONMENT == *-aten-* ]]; then - if [[ CMAKE_ARGS != *USE_ATEN* ]] && [[ CMAKE_ARGS != *BUILD_ATEN* ]]; then - CMAKE_ARGS+=("-DBUILD_ATEN=ON") - fi +############################################################################### +# Explicitly set Python executable. +############################################################################### +# On Ubuntu 16.04 the default Python is still 2.7. +PYTHON="$(which python)" +if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then + PYTHON=$(which "python${BASH_REMATCH[1]}") + CMAKE_ARGS+=("-DPYTHON_EXECUTABLE=${PYTHON}") fi -# Run build script from scripts if applicable + +############################################################################### +# Use special scripts for Android, conda, and setup builds +############################################################################### if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then export ANDROID_NDK=/opt/ndk + CMAKE_ARGS+=("-DBUILD_BINARY=ON") + CMAKE_ARGS+=("-DBUILD_TEST=ON") + CMAKE_ARGS+=("-DUSE_OBSERVERS=ON") + CMAKE_ARGS+=("-DUSE_ZSTD=ON") "${ROOT_DIR}/scripts/build_android.sh" ${CMAKE_ARGS[*]} "$@" exit 0 -fi -if [[ "${BUILD_ENVIRONMENT}" == conda* ]]; then - +elif [[ "${BUILD_ENVIRONMENT}" == conda* ]]; then # click (required by onnx) wants these set # TODO don't think this fixes the problem for conda3 yet export LANG=C.UTF-8 @@ -96,51 +107,52 @@ if [[ "${BUILD_ENVIRONMENT}" == conda* ]]; then PROTOBUF_INCDIR=/opt/conda/include pip install -b /tmp/pip_install_onnx "file://${ROOT_DIR}/third_party/onnx#egg=onnx" report_compile_cache_stats exit 0 +elif [[ $BUILD_ENVIRONMENT == *setup* ]]; then + rm -rf $INSTALL_PREFIX && mkdir $INSTALL_PREFIX + PYTHONPATH=$INSTALL_PREFIX $PYTHON setup_caffe2.py develop --install-dir $INSTALL_PREFIX + exit 0 fi -# Run cmake from ./build_caffe2 directory so it doesn't conflict with -# standard PyTorch build directory. Eventually these won't need to -# be separate. 
-rm -rf build_caffe2 -mkdir build_caffe2 -cd ./build_caffe2 -INSTALL_PREFIX="/usr/local/caffe2" +############################################################################### +# Set cmake args +############################################################################### +CMAKE_ARGS+=("-DBUILD_BINARY=ON") +CMAKE_ARGS+=("-DBUILD_TEST=ON") +CMAKE_ARGS+=("-DINSTALL_TEST=ON") +CMAKE_ARGS+=("-DUSE_OBSERVERS=ON") +CMAKE_ARGS+=("-DUSE_ZSTD=ON") CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}") -# Explicitly set Python executable. -# On Ubuntu 16.04 the default Python is still 2.7. -PYTHON="$(which python)" -if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then - PYTHON=$(which "python${BASH_REMATCH[1]}") - CMAKE_ARGS+=("-DPYTHON_EXECUTABLE=${PYTHON}") +if [[ $BUILD_ENVIRONMENT == *-aten-* ]]; then + if [[ CMAKE_ARGS != *USE_ATEN* ]] && [[ CMAKE_ARGS != *BUILD_ATEN* ]]; then + CMAKE_ARGS+=("-DBUILD_ATEN=ON") + fi +fi +if [[ $BUILD_ENVIRONMENT == *mkl* ]]; then + CMAKE_ARGS+=("-DBLAS=MKL") fi +if [[ $BUILD_ENVIRONMENT == *cuda* ]]; then + CMAKE_ARGS+=("-DUSE_CUDA=ON") + CMAKE_ARGS+=("-DCUDA_ARCH_NAME=Maxwell") + CMAKE_ARGS+=("-DUSE_NNPACK=OFF") -case "${BUILD_ENVIRONMENT}" in - *-mkl*) - CMAKE_ARGS+=("-DBLAS=MKL") - ;; - *-cuda*) - CMAKE_ARGS+=("-DUSE_CUDA=ON") - CMAKE_ARGS+=("-DCUDA_ARCH_NAME=Maxwell") - CMAKE_ARGS+=("-DUSE_NNPACK=OFF") - - # Explicitly set path to NVCC such that the symlink to ccache or sccache is used - CMAKE_ARGS+=("-DCUDA_NVCC_EXECUTABLE=${CACHE_WRAPPER_DIR}/nvcc") - - # Ensure FindCUDA.cmake can infer the right path to the CUDA toolkit. - # Setting PATH to resolve to the right nvcc alone isn't enough. - # See /usr/share/cmake-3.5/Modules/FindCUDA.cmake, block at line 589. - export CUDA_PATH="/usr/local/cuda" - - # Ensure the ccache symlink can still find the real nvcc binary. - export PATH="/usr/local/cuda/bin:$PATH" - ;; - *-rocm*) - export LANG=C.UTF-8 - export LC_ALL=C.UTF-8 - export HCC_AMDGPU_TARGET=gfx900 -esac + # Explicitly set path to NVCC such that the symlink to ccache or sccache is used + CMAKE_ARGS+=("-DCUDA_NVCC_EXECUTABLE=${CACHE_WRAPPER_DIR}/nvcc") + + # Ensure FindCUDA.cmake can infer the right path to the CUDA toolkit. + # Setting PATH to resolve to the right nvcc alone isn't enough. + # See /usr/share/cmake-3.5/Modules/FindCUDA.cmake, block at line 589. + export CUDA_PATH="/usr/local/cuda" + + # Ensure the ccache symlink can still find the real nvcc binary. + export PATH="/usr/local/cuda/bin:$PATH" +fi +if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then + export LANG=C.UTF-8 + export LC_ALL=C.UTF-8 + export HCC_AMDGPU_TARGET=gfx900 +fi # Try to include Redis support for Linux builds if [ "$(uname)" == "Linux" ]; then @@ -154,14 +166,6 @@ if [ "$(uname)" == "Darwin" ]; then CMAKE_ARGS+=("-DBUILD_CUSTOM_PROTOBUF=ON") fi -# We test the presence of cmake3 (for platforms like Centos and Ubuntu 14.04) -# and use that if so. -if [[ -x "$(command -v cmake3)" ]]; then - CMAKE_BINARY=cmake3 -else - CMAKE_BINARY=cmake -fi - # Use a speciallized onnx namespace in CI to catch hardcoded onnx namespace CMAKE_ARGS+=("-DONNX_NAMESPACE=ONNX_NAMESPACE_FOR_C2_CI") @@ -173,10 +177,13 @@ if [[ -n "$INTEGRATED" ]]; then CMAKE_ARGS+=("-DCAFFE2_LINK_LOCAL_PROTOBUF=OFF") fi -# Configure -${CMAKE_BINARY} "${ROOT_DIR}" ${CMAKE_ARGS[*]} "$@" - -# Build +# We test the presence of cmake3 (for platforms like Centos and Ubuntu 14.04) +# and use that if so. 
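The -DONNX_NAMESPACE=ONNX_NAMESPACE_FOR_C2_CI flag set above works by renaming the namespace that ONNX's generated code is compiled into, so any source file that spells out onnx:: directly stops compiling in CI. A minimal sketch of that macro pattern follows; ModelProto here is a placeholder, not ONNX's actual generated header:

    // Sketch only: illustrates the namespace-macro pattern the CI flag relies on.
    #ifndef ONNX_NAMESPACE
    #define ONNX_NAMESPACE onnx            // default namespace for regular builds
    #endif

    namespace ONNX_NAMESPACE {
    struct ModelProto {};                  // stand-in for a generated protobuf class
    }

    // Portable: resolves no matter what the macro expands to.
    ONNX_NAMESPACE::ModelProto good;

    // Hard-coded: only compiles when ONNX_NAMESPACE expands to `onnx`, so a CI
    // build configured with -DONNX_NAMESPACE=ONNX_NAMESPACE_FOR_C2_CI rejects it.
    // onnx::ModelProto bad;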
+if [[ -x "$(command -v cmake3)" ]]; then + CMAKE_BINARY=cmake3 +else + CMAKE_BINARY=cmake +fi # sccache will fail for CUDA builds if all cores are used for compiling if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]] && [ -n "${SCCACHE}" ]; then MAX_JOBS=`expr $(nproc) - 1` @@ -184,6 +191,21 @@ else MAX_JOBS=$(nproc) fi + +############################################################################### +# Configure and make +############################################################################### +# Run cmake from ./build_caffe2 directory so it doesn't conflict with +# standard PyTorch build directory. Eventually these won't need to +# be separate. +rm -rf build_caffe2 +mkdir build_caffe2 +cd ./build_caffe2 + +# Configure +${CMAKE_BINARY} "${ROOT_DIR}" ${CMAKE_ARGS[*]} "$@" + +# Build if [ "$(uname)" == "Linux" ]; then make "-j${MAX_JOBS}" install else @@ -193,6 +215,11 @@ fi report_compile_cache_stats + +############################################################################### +# Install ONNX +############################################################################### + # Install ONNX into a local directory pip install --user -b /tmp/pip_install_onnx "file://${ROOT_DIR}/third_party/onnx#egg=onnx" diff --git a/.jenkins/pytorch/macos-build.sh b/.jenkins/pytorch/macos-build.sh old mode 100644 new mode 100755 index c020cf1b72c988..ad3ac913d81956 --- a/.jenkins/pytorch/macos-build.sh +++ b/.jenkins/pytorch/macos-build.sh @@ -21,10 +21,14 @@ rm -rf ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch* git submodule update --init --recursive export CMAKE_PREFIX_PATH=${PYTORCH_ENV_DIR}/miniconda3/ -# Build and test PyTorch +# Build PyTorch export MACOSX_DEPLOYMENT_TARGET=10.9 export CXX=clang++ export CC=clang +if which sccache > /dev/null; then + export CXX="sccache clang++" + export CC="sccache clang" +fi # If we run too many parallel jobs, we will OOM export MAX_JOBS=2 diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh old mode 100644 new mode 100755 index 741e10659ce7be..ce05699c6692f5 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -21,7 +21,7 @@ rm -rf ${PYTORCH_ENV_DIR}/miniconda3/lib/python3.6/site-packages/torch* git submodule update --init --recursive export CMAKE_PREFIX_PATH=${PYTORCH_ENV_DIR}/miniconda3/ -# Build and test PyTorch +# Test PyTorch export MACOSX_DEPLOYMENT_TARGET=10.9 export CXX=clang++ export CC=clang diff --git a/.jenkins/pytorch/win-build.sh b/.jenkins/pytorch/win-build.sh index b7825b0c4fedcb..86db838e0dced3 100755 --- a/.jenkins/pytorch/win-build.sh +++ b/.jenkins/pytorch/win-build.sh @@ -44,7 +44,15 @@ set MAGMA_HOME=%cd%\\magma :: Install sccache mkdir %CD%\\tmp_bin -if "%REBUILD%"=="" ( aws s3 cp s3://ossci-windows/sccache.exe %CD%\\tmp_bin\\sccache.exe --quiet ) +if "%REBUILD%"=="" ( + :check_sccache + %CD%\\tmp_bin\\sccache.exe --show-stats || ( + taskkill /im sccache.exe /f /t || set ERRORLEVEL=0 + del %CD%\\tmp_bin\\sccache.exe + aws s3 cp s3://ossci-windows/sccache.exe %CD%\\tmp_bin\\sccache.exe + goto :check_sccache + ) +) :: Install Miniconda3 if "%REBUILD%"=="" ( @@ -73,7 +81,7 @@ set CUDNN_ROOT_DIR=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.0 :: Target only our CI GPU machine's CUDA arch to speed up the build set TORCH_CUDA_ARCH_LIST=5.2 -sccache --stop-server || set ERRORLEVEL=0 +sccache --stop-server sccache --start-server sccache --zero-stats set CC=sccache cl diff --git a/CMakeLists.txt b/CMakeLists.txt index f69091a695e018..a766d0299bdf6b 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,6 +30,15 @@ if(NOT DEFINED BLAS_SET_BY_USER) set(BLAS_SET_BY_USER ${BLAS_SET_BY_USER} CACHE STRING "Marks whether BLAS was manually set by user or auto-detected") endif() +# These lines are an attempt to make find_package(cuda) pick up +# libcuda.dylib, and not cuda.framework. It doesn't work all +# the time, but it seems to help for some users. +# TODO: replace this with a more robust fix +if(APPLE) + set(CMAKE_FIND_FRAMEWORK LAST) + set(CMAKE_FIND_APPBUNDLE LAST) +endif() + # ---[ Options. # Note to developers: if you add an option below, make sure you also add it to # cmake/Summary.cmake so that the summary prints out the option values. @@ -48,8 +57,11 @@ cmake_dependent_option( CAFFE2_USE_MSVC_STATIC_RUNTIME "Using MSVC static runtime libraries" ON "NOT BUILD_SHARED_LIBS" OFF) cmake_dependent_option( - BUILD_TEST "Build Caffe2 C++ test binaries (need gtest and gbenchmark)" ON + BUILD_TEST "Build Caffe2 C++ test binaries (need gtest and gbenchmark)" OFF "BUILD_CAFFE2" OFF) +cmake_dependent_option( + INSTALL_TEST "Install test binaries if BUILD_TEST is on" OFF + "BUILD_TEST" OFF) option(USE_ACL "Use ARM Compute Library" OFF) option(USE_ASAN "Use Address Sanitizer" OFF) option(USE_ATEN "Use ATen" OFF) @@ -210,12 +222,16 @@ if(NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wextra") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-field-initializers") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-type-limits") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-warning-option") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-pragmas") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-sign-compare") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-variable") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-function") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new") + endif() if ($ENV{WERROR}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") endif($ENV{WERROR}) @@ -273,6 +289,9 @@ if(BUILD_CAFFE2) add_subdirectory(caffe/proto) endif() +# ---[ Shared build +add_subdirectory(c10) + # ---[ Main build add_subdirectory(caffe2) diff --git a/CODEOWNERS b/CODEOWNERS index 5721f1688901bf..7cc2b3d47ec2e2 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -11,9 +11,9 @@ /requirements.txt @apaszke @soumith @colesbury @gchanan @zdevito @ezyang /torch/csrc/api/ @apaszke @soumith @colesbury @gchanan @zdevito @ezyang @ebetica @goldsborough /test/cpp/api/ @apaszke @soumith @colesbury @gchanan @zdevito @ezyang @ebetica @goldsborough -/torch/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer -/torch/csrc/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer -/torch/csrc/jit/passes/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer -/test/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer -/scripts/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer +/torch/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer @Yangqing +/torch/csrc/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer @Yangqing +/torch/csrc/jit/passes/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer @Yangqing 
+/test/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer @Yangqing +/scripts/onnx/ @anderspapitto @bddppq @dzhulgakov @ezyang @houseroad @jamesr66a @smessmer @Yangqing /torch/lib/c10d/ @apaszke @pietern @teng-li diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index fc8b5afe496ebc..c9e948de418067 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -13,7 +13,6 @@ else() USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF) option(ATEN_NO_TEST "Do not build ATen test binaries" OFF) - option(ATEN_NO_CONTRIB "Do not build ATen contrib" OFF) # Legacy options, which we will eventually remove cmake_dependent_option( @@ -157,13 +156,6 @@ list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/src/ATen) add_subdirectory(src/ATen) -if(ATEN_NO_CONTRIB) - message("disable contrib because ATEN_NO_CONTRIB is set") -else() - add_subdirectory(contrib/data) - add_subdirectory(contrib/meter) -endif() - if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) # Pass source, includes, and libs to parent set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) diff --git a/aten/contrib/data/BatchDataset.cc b/aten/contrib/data/BatchDataset.cc deleted file mode 100644 index e4d4ae458034ec..00000000000000 --- a/aten/contrib/data/BatchDataset.cc +++ /dev/null @@ -1,61 +0,0 @@ -#include "BatchDataset.h" -#include "Dataset.h" -#include "ATen/ATen.h" -#include -#include -#include - -using namespace at; - -BatchDataset::BatchDataset(Dataset& dataset, uint64_t batchsize) { - BatchDataset(dataset, batchsize, true); -} - -BatchDataset::BatchDataset(Dataset& dataset, uint64_t batchsize, bool fullbatches) { - dataset_ = &dataset; - size_ = dataset_->size(); - batchsize_ = batchsize; - fullbatches_ = fullbatches; -} - -void BatchDataset::getField(uint64_t idx, std::string& fieldkey, at::Tensor& field) { - - // assertions: - assert(idx < size()); - assert(hasField(fieldkey)); - - // loop over samples: - Tensor singlefield, buffer; - uint64_t maxsize = std::min(batchsize_, size_ - idx * batchsize_); - for(uint64_t n = 0; n < maxsize; n++) { - - // get sample: - uint64_t batchidx = idx * batchsize_ + n; - dataset_->getField(batchidx, fieldkey, singlefield); - - // allocate memory for batch: - if(n == 0) { - - // determine size of batch: - std::vector fieldsize; - fieldsize.push_back(maxsize); - for(long d = 0; d < singlefield.dim(); ++d) { - fieldsize.push_back(singlefield.size(d)); - } - - // resize buffer: - field.resize_(fieldsize); - } - - // copy sample into batch: - buffer = select(field, 0, n); - buffer.copy_(singlefield); - } -} - -uint64_t BatchDataset::size() { - if(fullbatches_) - return floor(size_ / batchsize_); - else - return ceil(size_ / batchsize_); -} diff --git a/aten/contrib/data/BatchDataset.h b/aten/contrib/data/BatchDataset.h deleted file mode 100644 index 4431541ad17280..00000000000000 --- a/aten/contrib/data/BatchDataset.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef AT_BATCH_DATASET_H -#define AT_BATCH_DATASET_H - -#include "Dataset.h" -#include "ATen/ATen.h" - -class BatchDataset : public Dataset -{ -public: - BatchDataset(Dataset& dataset, uint64_t batchsize); - BatchDataset(Dataset& dataset, uint64_t batchsize, bool fullbatches); - virtual void getField(uint64_t idx, std::string& fieldkey, at::Tensor& field); - virtual uint64_t size(); -private: - Dataset* dataset_; - uint64_t batchsize_; - uint64_t size_; - bool fullbatches_; -}; - -#endif diff --git a/aten/contrib/data/CMakeLists.txt b/aten/contrib/data/CMakeLists.txt deleted file mode 100644 index 07bda18fbdef79..00000000000000 --- 
a/aten/contrib/data/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -cmake_minimum_required(VERSION 3.0) -project(ATen) - -if(APPLE) - set(CMAKE_MACOSX_RPATH ON) -endif() - -if(CMAKE_VERSION VERSION_LESS "3.1") - set(CMAKE_CXX_FLAGS "--std=c++11 ${CMAKE_CXX_FLAGS}") -else() - set(CMAKE_CXX_STANDARD 11) -endif() - -set(src - BatchDataset.cc - ConcatDataset.cc - Dataset.cc - MergeDataset.cc - ResampleDataset.cc - ShuffleDataset.cc - TensorDataset.cc - TransformDataset.cc -) - -add_library(xtdata ${TH_LINK_STYLE} ${src}) -target_link_libraries(xtdata ATen_cpu_library) -target_include_directories(xtdata PRIVATE ${ATen_CPU_INCLUDE}) -target_include_directories(xtdata PRIVATE .) - -# add_executable(test-data test/basic.cc) -# target_link_libraries(test-data xtdata) diff --git a/aten/contrib/data/ConcatDataset.cc b/aten/contrib/data/ConcatDataset.cc deleted file mode 100644 index 89ae3c1662b297..00000000000000 --- a/aten/contrib/data/ConcatDataset.cc +++ /dev/null @@ -1,63 +0,0 @@ -#include "ConcatDataset.h" -#include "Dataset.h" -#include -#include - -using namespace at; - -ConcatDataset::ConcatDataset(std::vector& datasets) { - datasets_ = &datasets; - size_ = 0; - beginindices_ = std::vector(); - endindices_ = std::vector(); - beginindices_.push_back(0); - for(Dataset* dataset : datasets) { - size_ += dataset->size(); - uint64_t curidx = endindices_.back(); - endindices_.push_back(beginindices_.back() + dataset->size()); - if(endindices_.size() > 1) - beginindices_.push_back(curidx + 1); - } -} - -void ConcatDataset::getField(uint64_t idx, std::string& fieldkey, Tensor &field) { - - // assertions: - assert(idx < size()); - assert(hasField(fieldkey)); - - // get sample from correct dataset: - uint64_t datasetidx = binarySearch(idx); - Dataset* curdataset = (*datasets_)[datasetidx]; - curdataset->getField(idx - beginindices_[datasetidx], fieldkey, field); -} - -uint64_t ConcatDataset::binarySearch(uint64_t idx) { - assert(idx < size()); // TODO: Add caching to this method. 
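The binarySearch being deleted along with ConcatDataset maps a flat sample index onto the dataset that owns it by searching the stored begin/end offsets. The same lookup can be written more directly with std::upper_bound over cumulative sizes; the sketch below illustrates that idea and is not the removed class's API:

    #include <algorithm>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // Map a global index to {dataset index, local index} given per-dataset sizes.
    struct ConcatIndex {
      std::vector<uint64_t> ends;  // ends[i] = cumulative size of datasets 0..i

      explicit ConcatIndex(const std::vector<uint64_t>& sizes) {
        uint64_t total = 0;
        for (uint64_t s : sizes) {
          total += s;
          ends.push_back(total);
        }
      }

      std::pair<uint64_t, uint64_t> locate(uint64_t idx) const {
        // First dataset whose cumulative end is strictly greater than idx.
        auto it = std::upper_bound(ends.begin(), ends.end(), idx);
        auto which = static_cast<uint64_t>(it - ends.begin());
        uint64_t begin = which == 0 ? 0 : ends[which - 1];
        return {which, idx - begin};
      }
    };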
- uint64_t left = 0; - uint64_t right = size_ - 1; - while(left != right) { - uint64_t middle = (right - left) / 2; - if(left == middle) { - if(idx > endindices_[left]) - left = right; - else - right = left; - } - else { - if(idx > endindices_[middle]) - left = middle; - else if(idx < beginindices_[middle]) - right = middle; - else { - left = middle; - right = middle; - } - } - } - return left; -} - -uint64_t ConcatDataset::size() { - return size_; -} diff --git a/aten/contrib/data/ConcatDataset.h b/aten/contrib/data/ConcatDataset.h deleted file mode 100644 index 2bde7ca0f514dd..00000000000000 --- a/aten/contrib/data/ConcatDataset.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef AT_CONCAT_DATASET_H -#define AT_CONCAT_DATASET_H - -#include "Dataset.h" - -class ConcatDataset : public Dataset -{ -public: - ConcatDataset(std::vector& datasets); - virtual void getField(uint64_t idx, std::string& fieldkey, at::Tensor &field); - virtual uint64_t size(); -private: - uint64_t binarySearch(uint64_t idx); - std::vector* datasets_; - std::vector beginindices_; - std::vector endindices_; - uint64_t size_; -}; - -#endif diff --git a/aten/contrib/data/Dataset.cc b/aten/contrib/data/Dataset.cc deleted file mode 100644 index e210be58636d4d..00000000000000 --- a/aten/contrib/data/Dataset.cc +++ /dev/null @@ -1,28 +0,0 @@ -#include "Dataset.h" -#include - -typedef std::map Fields; - -void Dataset::get(int64_t idx, Fields& fields) { - for(auto& field : fields) { - std::string fieldname = field.first; - assert(hasField(fieldname)); - getField(idx, fieldname, field.second); - } -} - -bool Dataset::hasField(std::string& fieldkey) { - auto search = fieldkeys_.find(fieldkey); - return (search != fieldkeys_.end()); -} - -std::set& Dataset::fieldKeys() { - return fieldkeys_; -} - -void Dataset::addFieldKey(std::string& fieldkey) { - fieldkeys_.insert(fieldkey); -} - -Dataset::~Dataset() { -} diff --git a/aten/contrib/data/Dataset.h b/aten/contrib/data/Dataset.h deleted file mode 100644 index 02b45acaf0a35b..00000000000000 --- a/aten/contrib/data/Dataset.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef AT_DATASET_H -#define AT_DATASET_H - -#include "ATen/ATen.h" -#include -#include -#include - -typedef std::map Fields; - -class Dataset { - std::set fieldkeys_; -public: - virtual uint64_t size() = 0; // pure virtual function - virtual void getField(uint64_t idx, std::string& fieldkey, at::Tensor& field) = 0; - virtual bool hasField(std::string& fieldkey); - virtual std::set& fieldKeys(); - virtual void addFieldKey(std::string& fieldkey); - virtual void get(int64_t idx, Fields& fields); - virtual ~Dataset(); -}; - -#endif diff --git a/aten/contrib/data/DatasetIterator.h b/aten/contrib/data/DatasetIterator.h deleted file mode 100644 index 9f888e265a1a57..00000000000000 --- a/aten/contrib/data/DatasetIterator.h +++ /dev/null @@ -1,93 +0,0 @@ -#ifndef AT_DATASET_ITERATOR_H -#define AT_DATASET_ITERATOR_H - -#include "Dataset.h" -#include - -class DatasetIterator : public std::iterator { -private: - uint64_t idx_ = 0; - Dataset* dataset_; -public: - - DatasetIterator(Dataset& dataset) { - dataset_ = &dataset; - idx_ = 0; - } - - DatasetIterator(DatasetIterator& rhs) { - DatasetIterator(*rhs.dataset_); - } - - DatasetIterator& operator ++() { - ++idx_; - return *this; - } - - DatasetIterator operator ++ (int) { - DatasetIterator tmp(*this); - ++idx_; - return tmp; - } - - friend bool operator == (const DatasetIterator& lhs, const DatasetIterator& rhs); - friend bool operator != (const DatasetIterator& lhs, const DatasetIterator& rhs); - - 
Fields operator* () const { - Fields sample; - dataset_->get(idx_, sample); - return sample; - } - - Fields* operator-> () const { - Fields sample; - dataset_->get(idx_, sample); - return &sample; - } -}; - -bool operator == (const DatasetIterator& lhs, const DatasetIterator& rhs) { - return lhs.dataset_ == rhs.dataset_; -} - -bool operator != (const DatasetIterator& lhs, const DatasetIterator& rhs) { - return lhs.dataset_ != rhs.dataset_; -} - -typedef DatasetIterator iterator; -//typedef DatasetIterator const_iterator; - -#endif - -/** -iterator { - iterator(const iterator&); - ~iterator(); - iterator& operator=(const iterator&); - iterator& operator++(); //prefix increment - reference operator*() const; - friend void swap(iterator& lhs, iterator& rhs); //C++11 I think -}; - -input_iterator : public virtual iterator { - iterator operator++(int); //postfix increment - value_type operator*() const; - pointer operator->() const; - friend bool operator==(const iterator&, const iterator&); - friend bool operator!=(const iterator&, const iterator&); -}; -//once an input iterator has been dereferenced, it is -//undefined to dereference one before that. - -output_iterator : public virtual iterator { - reference operator*() const; - iterator operator++(int); //postfix increment -}; -//dereferences may only be on the left side of an assignment -//once an input iterator has been dereferenced, it is -//undefined to dereference one before that. - -forward_iterator : input_iterator, output_iterator { - forward_iterator(); -}; -**/ diff --git a/aten/contrib/data/MergeDataset.cc b/aten/contrib/data/MergeDataset.cc deleted file mode 100644 index a3cab619c3012a..00000000000000 --- a/aten/contrib/data/MergeDataset.cc +++ /dev/null @@ -1,30 +0,0 @@ -#include "MergeDataset.h" -#include - -using namespace at; - -MergeDataset::MergeDataset(std::vector& datasets) { - datasets_ = &datasets; - uint64_t idx = 0; - for(Dataset* dataset : *datasets_) { - for(auto& fieldkey : dataset->fieldKeys()) { - std::string fieldkeyc = fieldkey; - addFieldKey(fieldkeyc); - datasetidx_[fieldkeyc] = idx; - } - } -} - -void MergeDataset::getField(uint64_t idx, std::string& fieldkey, Tensor& field) { - assert(idx < size()); - assert(hasField(fieldkey)); - Dataset* curdataset = (*datasets_)[datasetidx_[fieldkey]]; - return curdataset->getField(idx, fieldkey, field); -} - -uint64_t MergeDataset::size() { - uint64_t size = 0; - for(Dataset* dataset : *datasets_) - size += dataset->size(); - return size; -} diff --git a/aten/contrib/data/MergeDataset.h b/aten/contrib/data/MergeDataset.h deleted file mode 100644 index 925646845bbd9d..00000000000000 --- a/aten/contrib/data/MergeDataset.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef AT_MERGE_DATASET_H -#define AT_MERGE_DATASET_H - -#include "Dataset.h" -#include -#include - -class MergeDataset : public Dataset -{ -public: - MergeDataset(std::vector& datasets); - virtual void getField(uint64_t idx, std::string& fieldkey, at::Tensor& field); - virtual uint64_t size(); -private: - std::vector* datasets_; - std::map datasetidx_; -}; - -#endif diff --git a/aten/contrib/data/ResampleDataset.cc b/aten/contrib/data/ResampleDataset.cc deleted file mode 100644 index ce8d3547ddb4fd..00000000000000 --- a/aten/contrib/data/ResampleDataset.cc +++ /dev/null @@ -1,47 +0,0 @@ -#include "ResampleDataset.h" -#include "Dataset.h" -#include -#include - -using namespace at; - -ResampleDataset::ResampleDataset(Dataset& dataset) { - dataset_ = &dataset; - size_ = dataset.size(); - perm_ = std::vector(); - 
perm_.reserve(size_); - for(uint64_t n = 0; n < size_; ++n) - perm_[n] = n; -} - -ResampleDataset::ResampleDataset(Dataset& dataset, std::vector& perm) { - dataset_ = &dataset; - size_ = dataset.size(); - perm_ = perm; - assert(perm_.size() == size_); -} - -ResampleDataset::ResampleDataset(Dataset& dataset, std::function perm) { - dataset_ = &dataset; - size_ = dataset.size(); - permfunc_ = perm; - resample(); -} - -void ResampleDataset::getField(uint64_t idx, std::string& fieldkey, at::Tensor& field) { - assert(idx < size()); - assert(hasField(fieldkey)); - dataset_->getField(perm_[idx], fieldkey, field); -} - -void ResampleDataset::resample() { - if(permfunc_) { - perm_.reserve(size_); - for(uint64_t n = 0; n < size_; ++n) - perm_[n] = permfunc_(n); - } -} - -uint64_t ResampleDataset::size() { - return size_; -} diff --git a/aten/contrib/data/ResampleDataset.h b/aten/contrib/data/ResampleDataset.h deleted file mode 100644 index 4f84012402eff7..00000000000000 --- a/aten/contrib/data/ResampleDataset.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef AT_RESAMPLE_DATASET_H -#define AT_RESAMPLE_DATASET_H - -#include -#include -#include -#include "ATen/ATen.h" -#include "Dataset.h" - -class ResampleDataset : public Dataset -{ -public: - ResampleDataset(Dataset& dataset); - ResampleDataset(Dataset& dataset, std::vector& perm); - ResampleDataset(Dataset& dataset, std::function perm); - virtual void getField(uint64_t idx, std::string& fieldkey, at::Tensor& field); - virtual uint64_t size(); - virtual void resample(); -protected: - std::vector perm_; - uint64_t size_; -private: - Dataset* dataset_; - std::function permfunc_; - std::vector fields_; -}; - -#endif diff --git a/aten/contrib/data/ShuffleDataset.cc b/aten/contrib/data/ShuffleDataset.cc deleted file mode 100644 index 25ebb38c69f9a1..00000000000000 --- a/aten/contrib/data/ShuffleDataset.cc +++ /dev/null @@ -1,16 +0,0 @@ -#include "ShuffleDataset.h" -#include "Dataset.h" -#include - -using namespace at; - -ShuffleDataset::ShuffleDataset(Dataset& dataset) : ResampleDataset(dataset) { - resample(); -} - -void ShuffleDataset::resample() { - perm_.reserve(size_); - for(size_t n = 0; n < size_; ++n) - perm_[n] = n; - std::random_shuffle(perm_.begin(), perm_.end()); -} diff --git a/aten/contrib/data/ShuffleDataset.h b/aten/contrib/data/ShuffleDataset.h deleted file mode 100644 index 80ac6ce640d032..00000000000000 --- a/aten/contrib/data/ShuffleDataset.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef AT_SHUFFLE_DATASET_H -#define AT_SHUFFLE_DATASET_H - -#include "Dataset.h" -#include "ResampleDataset.h" - -class ShuffleDataset : public ResampleDataset -{ -public: - ShuffleDataset(Dataset& dataset); - virtual void resample(); -}; - -#endif diff --git a/aten/contrib/data/TensorDataset.cc b/aten/contrib/data/TensorDataset.cc deleted file mode 100644 index fc31e6eb6a298f..00000000000000 --- a/aten/contrib/data/TensorDataset.cc +++ /dev/null @@ -1,27 +0,0 @@ -#include "TensorDataset.h" -#include "ATen/ATen.h" -#include - -using namespace at; - -TensorDataset::TensorDataset(Tensor& t, std::string& fieldkey) { - t_ = t; - fieldkey_ = fieldkey; - addFieldKey(fieldkey); -} - -void TensorDataset::getField(uint64_t idx, std::string& fieldkey, Tensor& field) { - - // assertions: - assert(idx < size()); - assert(fieldkey_.compare(fieldkey) == 0); - - // get sample: - Tensor buffer = select(t_, 0, idx); - field.copy_(buffer); - -} - -uint64_t TensorDataset::size() { - return t_.size(0); -} diff --git a/aten/contrib/data/TensorDataset.h b/aten/contrib/data/TensorDataset.h 
deleted file mode 100644 index 3d2db04b8e612f..00000000000000 --- a/aten/contrib/data/TensorDataset.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef AT_TENSOR_DATASET_H -#define AT_TENSOR_DATASET_H - -#include "Dataset.h" -#include "ATen/ATen.h" -#include - -class TensorDataset : public Dataset -{ -public: - TensorDataset(at::Tensor& t, std::string& fieldkey); - virtual void getField(uint64_t idx, std::string& fieldkey, at::Tensor& field); - virtual uint64_t size(); -private: - at::Tensor t_; - std::string fieldkey_; -}; - -#endif diff --git a/aten/contrib/data/TransformDataset.cc b/aten/contrib/data/TransformDataset.cc deleted file mode 100644 index 80bdc7e84e6b46..00000000000000 --- a/aten/contrib/data/TransformDataset.cc +++ /dev/null @@ -1,25 +0,0 @@ -#include "TransformDataset.h" -#include "ATen/ATen.h" -#include "ATen/ATen.h" -#include - -using namespace at; - -TransformDataset::TransformDataset(Dataset& dataset, std::string& fieldkey, std::function& transform) { - assert(hasField(fieldkey)); - dataset_ = &dataset; - fieldkey_ = fieldkey; - transform_ = transform; -} - -void TransformDataset::getField(uint64_t idx, std::string& fieldkey, Tensor& field) { - dataset_->getField(idx, fieldkey, field); - if(fieldkey.compare(fieldkey_) == 0) { - Tensor transformed = transform_(field); - field.copy_(transformed); - } -} - -uint64_t TransformDataset::size() { - return dataset_->size(); -} diff --git a/aten/contrib/data/TransformDataset.h b/aten/contrib/data/TransformDataset.h deleted file mode 100644 index ac3737ea74a1c3..00000000000000 --- a/aten/contrib/data/TransformDataset.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef AT_TRANSFORM_DATASET_H -#define AT_TRANSFORM_DATASET_H - -#include "Dataset.h" -#include "ATen/ATen.h" -#include -#include - -using namespace at; - -class TransformDataset : public Dataset -{ -public: - TransformDataset(Dataset& dataset, std::string& fieldkey, std::function& transform); - virtual void getField(uint64_t idx, std::string& fieldkey, Tensor& field); - virtual uint64_t size(); -private: - Dataset* dataset_; - std::string fieldkey_; - std::function transform_; -}; - -#endif diff --git a/aten/contrib/data/test/basic.cc b/aten/contrib/data/test/basic.cc deleted file mode 100644 index c3e48a2785e4d4..00000000000000 --- a/aten/contrib/data/test/basic.cc +++ /dev/null @@ -1,22 +0,0 @@ -#include "Dataset.h" -#include "DatasetIterator.h" -#include "TensorDataset.h" -#include - -using namespace at; - -int main() -{ - std::cout << "hello\n"; - - Tensor tensor = rand(CPU(kDouble), {256,32}); - - TensorDataset dataset(tensor); - DatasetIterator datasetiterator(dataset); - uint64_t cnt = 0; - for(auto& sample : datasetiterator) { - std::cout << "got sample " << cnt << std:endl; - cnt++; - } - return 0; -} diff --git a/aten/contrib/data/threadpool/ThreadPool.cc b/aten/contrib/data/threadpool/ThreadPool.cc deleted file mode 100644 index 5a1172f8912e3a..00000000000000 --- a/aten/contrib/data/threadpool/ThreadPool.cc +++ /dev/null @@ -1,77 +0,0 @@ -// dependencies: -#include "ThreadPool.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// constructor launches the specified number of workers: -ThreadPool::ThreadPool(uint64_t threads) : stop(false) { - - // loop over all threads: - for(uint64_t i = 0; i < threads; ++i) { - workers.emplace_back( - [this] { - for(;;) { - std::function task; - { - std::unique_lock lock(this->queue_mutex); - this->condition.wait(lock, - [this]{ return this->stop || !this->tasks.empty(); }); - 
if(this->stop && this->tasks.empty()) - return; - task = std::move(this->tasks.front()); - this->tasks.pop(); - } - task(); - } - } - ); - } -} - -// synchronize all the threads: -void ThreadPool::synchronize() { - for(uint64_t i = 0; i < futures.size(); i++) - futures[i].first.wait(); -} - -// poll the threads for results: -unsigned int ThreadPool::waitFor() { - - // wait until a task is finished: - uint64_t i; - std::future_status status; - do { - for(i = 0; i < futures.size(); i++) { - status = futures[i].first.wait_for(std::chrono::microseconds(0)); - if(status == std::future_status::ready) break; - } - std::this_thread::sleep_for(std::chrono::microseconds(1)); - } while (status != std::future_status::ready); - - // get the result and remove the future: - futures[i].first.get(); - unsigned int handle = futures[i].second; - iter_swap(futures.begin() + i, futures.end() - 1); - futures.pop_back(); - return handle; -} - -// the destructor joins all threads: -ThreadPool::~ThreadPool() { - { - std::unique_lock lock(queue_mutex); - stop = true; - } - condition.notify_all(); - for(std::thread &worker: workers) - worker.join(); -} diff --git a/aten/contrib/data/threadpool/ThreadPool.h b/aten/contrib/data/threadpool/ThreadPool.h deleted file mode 100644 index 3024af43144100..00000000000000 --- a/aten/contrib/data/threadpool/ThreadPool.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef AT_THREADPOOL_H -#define AT_THREADPOOL_H - -// dependencies: -#include -#include -#include -#include -#include -#include - -// definition of the ThreadPool class: -class ThreadPool { - public: - explicit ThreadPool(uint64_t); - void synchronize(); - unsigned int waitFor(); - template - unsigned int enqueue(F&& f, Args&&... args); - ~ThreadPool(); - - private: - - // all the threads that can perform work: - std::vector< std::thread > workers; - - // the list of futures: - unsigned int handle = 0; - std::vector< std::pair< std::future, unsigned int > > futures; - - // the task queue: - std::queue< std::function > tasks; - - // synchronization: - std::mutex queue_mutex; - std::condition_variable condition; - bool stop; -}; - -// enqueue new work item into the pool: -template -unsigned int ThreadPool::enqueue(F&& f, Args&&... args) -{ - - // create the task: - auto task = std::make_shared< std::packaged_task >( - std::bind(std::forward(f), std::forward(args)...) 
- ); - - // get future and enqueue the task: - std::future future = task->get_future(); - { - std::unique_lock lock(queue_mutex); - if(stop) throw std::runtime_error("enqueue on stopped ThreadPool"); - tasks.emplace([task](){ (*task)(); }); - } - condition.notify_one(); - - // generate handle and store future: - handle++; - futures.push_back(std::make_pair(std::move(future), handle)); - return handle; -} - -#endif diff --git a/aten/contrib/meter/APMeter.cc b/aten/contrib/meter/APMeter.cc deleted file mode 100644 index 316d4366463317..00000000000000 --- a/aten/contrib/meter/APMeter.cc +++ /dev/null @@ -1,89 +0,0 @@ -#include "APMeter.h" -#include -#include - -using namespace at; - -APMeter::APMeter() { - reset(); -} - -void APMeter::reset() { - outputs_ = CPU(kFloat).tensor(); - targets_ = CPU(kFloat).tensor(); - n_ = 0; -} - -void APMeter::add(Tensor& output, Tensor& target) { - - // assertions and allocations: - assert(output.dim() == 2 && target.dim() == 2); - //assert(isSameSizeAs(output, target)); - assert(output.size(1) == outputs_.size(1)); - - // get current outputs and targets: - Tensor curoutputs = getOutputs(); - Tensor curtargets = getTargets(); - - // make sure underlying storages are sufficiently large: - if(numel(outputs_) < numel(curoutputs) + numel(output)) { - long long newsize = ceil(numel(outputs_) * 1.5); - outputs_.resize_({newsize + numel(output)}); - targets_.resize_({newsize + numel(output)}); - } - n_ += output.size(0); - - // store scores and targets: - uint64_t offset = (numel(curoutputs) > 0) ? curoutputs.size(0) : 0; - Tensor outputbuffer = outputs_.narrow( 0, offset, output.size(0)); - Tensor targetbuffer = targets_.narrow( 0, offset, target.size(0)); - - outputbuffer.copy_(output); - targetbuffer.copy_(target); -} - -Tensor APMeter::getOutputs() { - return outputs_.narrow(0, 0, n_); -} -Tensor APMeter::getTargets() { - return targets_.narrow(0, 0, n_); -} - -void APMeter::value(Tensor& val) { - - // get current outputs and targets: - Tensor curoutputs = getOutputs(); - Tensor curtargets = getTargets(); - - // allocate some memory: - val.resize_({curoutputs.size(1)}); - double * val_d = val.data(); - Tensor outputbuffer, targetbuffer, sortval, sortidx, sorttgt; - Tensor truepos, precision; - Tensor range = val.type().range(0,curoutputs.size(0)); - - // loop over all classes: - for(long k = 0; k < curoutputs.size(1); ++k) { - - // sort scores: - outputbuffer = curoutputs.narrow( 1, k, 1); - targetbuffer = curtargets.narrow(1, k, 1).contiguous().toType(CPU(kDouble)); - double * targetbuffer_d = targetbuffer.data(); - std::tie(sortval, sortidx) = sort(curoutputs, 0); - sorttgt = index_select(targetbuffer, 0, sortidx); - - // compue true positive sums, and precision: - truepos = cumsum(targetbuffer,0); // NOTE: Cast to double first? - precision = div(truepos, range); - double * precision_d = precision.data(); - // compute average precision: - val_d[k] = .0; - for(long n = 0; n < precision.size(0); ++n) { - if(targetbuffer_d[n] != 0.) 
- val_d[k] += precision_d[n]; - } - auto norm = sum(targetbuffer).toCDouble(); - if(norm > 0) - val_d[k] /= norm; - } -} diff --git a/aten/contrib/meter/APMeter.h b/aten/contrib/meter/APMeter.h deleted file mode 100644 index 06e8a6330130d7..00000000000000 --- a/aten/contrib/meter/APMeter.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef AT_AP_METER_H -#define AT_AP_METER_H - -#include "Meter.h" -#include "ATen/ATen.h" - -class APMeter : public Meter -{ -public: - APMeter(); - virtual void add(Tensor& output, Tensor& target); - virtual void value(Tensor& val); - virtual void reset(); - virtual Tensor getOutputs(); - virtual Tensor getTargets(); -private: - Tensor outputs_; - Tensor targets_; - uint64_t n_; -}; - -#endif diff --git a/aten/contrib/meter/AUCMeter.cc b/aten/contrib/meter/AUCMeter.cc deleted file mode 100644 index fdd352b7014a08..00000000000000 --- a/aten/contrib/meter/AUCMeter.cc +++ /dev/null @@ -1,60 +0,0 @@ -#include "AUCMeter.h" -#include "APMeter.h" -#include - -using namespace at; - -AUCMeter::AUCMeter() { - reset(); -} - -void AUCMeter::reset() { - meter_ = APMeter(); -} - -void AUCMeter::add(Tensor& output, Tensor& target) { - meter_.add(output, target); -} - -void AUCMeter::value(Tensor& val) { - - // get data from APMeter: - Tensor outputs = meter_.getOutputs(); - Tensor targets = meter_.getTargets(); - - // sort scores: - Tensor sortval, sortidx, sorttgt; - std::tie(sortval, sortidx) = sort(outputs, 0, true); - sorttgt = index_select(targets, 0, sortidx); - int64_t * sortidx_d = sortidx.data(); - int16_t * targets_d = sortidx.data(); - // construct the ROC curve: - Tensor tpr = zeros(CPU(kDouble), {numel(outputs)}); - Tensor fpr = zeros(CPU(kDouble), {numel(outputs)}); - - double * tpr_d = tpr.data(); - double * fpr_d = fpr.data(); - for(long n = 1; n <= numel(outputs); ++n) { - if(targets_d[sortidx_d[n - 1]] == 1) { - tpr_d[n] = tpr_d[n - 1] + 1.; - fpr_d[n] = fpr_d[n - 1]; - } else { - tpr_d[n] = tpr_d[n - 1]; - fpr_d[n] = fpr_d[n - 1] + 1.; - } - } - tpr.div_(sum(targets)); - fpr.div_(sum(at::add(mul(targets, -1.), 1.))); - - /** - local auc = torch.cmul( - tpr:narrow(1, 1, tpr:nElement() - 1), - fpr:narrow(1, 2, fpr:nElement() - 1) - - fpr:narrow(1, 1, fpr:nElement() - 1)):sum() - */ - - val.resize_({1}).fill_( - sum(mul(tpr.narrow(0, 0, numel(tpr) - 1), - sub(fpr.narrow(0, 1, numel(tpr) - 1), - fpr.narrow(0, 0, numel(tpr) - 1))))); -} diff --git a/aten/contrib/meter/AUCMeter.h b/aten/contrib/meter/AUCMeter.h deleted file mode 100644 index 9da96e6bab65e9..00000000000000 --- a/aten/contrib/meter/AUCMeter.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef AT_AUC_METER_H -#define AT_AUC_METER_H - -#include "Meter.h" -#include "APMeter.h" -#include "ATen/ATen.h" - -class AUCMeter : public Meter -{ -public: - AUCMeter(); - virtual void reset(); - virtual void add(Tensor& output, Tensor& target); - virtual void value(Tensor& val); -private: - APMeter meter_; -}; - -#endif diff --git a/aten/contrib/meter/CMakeLists.txt b/aten/contrib/meter/CMakeLists.txt deleted file mode 100644 index 1f23ff50e64505..00000000000000 --- a/aten/contrib/meter/CMakeLists.txt +++ /dev/null @@ -1,27 +0,0 @@ -# requires to be compiled along xttensor - -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) - -# C++11 -if(CMAKE_VERSION VERSION_LESS "3.1") - set(CMAKE_CXX_FLAGS "--std=c++11 ${CMAKE_CXX_FLAGS}") -else() - set(CMAKE_CXX_STANDARD 11) -endif() - -set(src - APMeter.cc - AUCMeter.cc - ClassErrorMeter.cc - MAPMeter.cc - MSEMeter.cc -) - -add_library(xtmeter SHARED ${src}) -target_link_libraries(xtmeter 
ATen_cpu_library) -target_include_directories(xtmeter PRIVATE ${ATen_CPU_INCLUDE}) - -add_executable(test-meter test/basic.cc ${BACKWARD_ENABLE}) -# add_backward(test-meter) -target_link_libraries(test-meter xtmeter) -target_include_directories(test-meter PRIVATE ${ATen_CPU_INCLUDE}) diff --git a/aten/contrib/meter/ClassErrorMeter.cc b/aten/contrib/meter/ClassErrorMeter.cc deleted file mode 100644 index a24e850542aed8..00000000000000 --- a/aten/contrib/meter/ClassErrorMeter.cc +++ /dev/null @@ -1,59 +0,0 @@ -#include "ClassErrorMeter.h" -#include "ATen/ATen.h" -#include - -using namespace at; - -ClassErrorMeter::ClassErrorMeter() { - ClassErrorMeter(1); -} - -ClassErrorMeter::ClassErrorMeter(const int64_t topk) { - topkval_ = CPU(kShort).tensor(); - sumval_ = CPU(kShort).tensor(); - topkval_.resize_({topk}); - sumval_.resize_({topk}); - reset(); -} - -void ClassErrorMeter::reset() { - range_out(topkval_, 1, numel(topkval_)); - sumval_.fill_(0.); - n_ = 0; -} - -void ClassErrorMeter::add(Tensor& output, Tensor& target) { - - // assertions and allocations: - assert(output.dim() == 2 && target.dim() == 1); - //assert(isSameSizeAs(output, target)); - auto sumval_d = sumval_.data(); - auto target_long = target.contiguous().toType(CPU(kLong)); - auto target_d = target_long.data(); - // update counts: - Tensor val, idx; - std::tie(val, idx) = topk(output, numel(topkval_), 1, true, true); - for(long n = 0; n < output.size(0); ++n) { - bool targetseen = false; - Tensor idx_n = idx.select(0,n); - auto idx_n_d = idx_n.data(); - for(long k = 0; k < numel(topkval_); ++k) { - n_++; - if(targetseen) { - sumval_d[k]++; - } else if(idx_n_d[k] == target_d[n]) { - targetseen = true; - sumval_d[k]++; - } - } - } -} - -void ClassErrorMeter::value(Tensor& val) { - val.resize_({numel(topkval_)}); - auto val_d = val.data(); - auto sumval_d = sumval_.data(); - for(long k = 0; k < numel(topkval_); ++k) { - val_d[k] = 1.0 - (double(sumval_d[k]) / double(n_)); - } -} diff --git a/aten/contrib/meter/ClassErrorMeter.h b/aten/contrib/meter/ClassErrorMeter.h deleted file mode 100644 index 87529b17dc716d..00000000000000 --- a/aten/contrib/meter/ClassErrorMeter.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef AT_CLASS_ERROR_METER_H -#define AT_CLASS_ERROR_METER_H - -#include "Meter.h" -#include "ATen/ATen.h" - -class ClassErrorMeter : public Meter -{ -public: - ClassErrorMeter(); - ClassErrorMeter(const int64_t topk); - virtual void reset(); - virtual void add(Tensor& output, Tensor& target); - virtual void value(Tensor& val); -private: - Tensor topkval_; - Tensor sumval_; - uint64_t n_; -}; - -#endif diff --git a/aten/contrib/meter/MAPMeter.cc b/aten/contrib/meter/MAPMeter.cc deleted file mode 100644 index a73c541d5ed34f..00000000000000 --- a/aten/contrib/meter/MAPMeter.cc +++ /dev/null @@ -1,23 +0,0 @@ -#include "MAPMeter.h" - -using namespace at; - -MAPMeter::MAPMeter() { - reset(); -} - -void MAPMeter::reset() { - meter_.reset(); -} - -void MAPMeter::add(Tensor& output, Tensor& target) { - meter_.add(output, target); -} - -void MAPMeter::value(Tensor& val) { - //TODO: 0-dim - val.resize_({1}); - Tensor allvalues = val.type().tensor(); - meter_.value(allvalues); - val.fill_(mean(allvalues)); -} diff --git a/aten/contrib/meter/MAPMeter.h b/aten/contrib/meter/MAPMeter.h deleted file mode 100644 index ee38a548661fbd..00000000000000 --- a/aten/contrib/meter/MAPMeter.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef AT_MAP_METER_H -#define AT_MAP_METER_H - -#include "Meter.h" -#include "APMeter.h" -#include "ATen/ATen.h" - -class MAPMeter 
: public Meter -{ -public: - MAPMeter(); - virtual void reset(); - virtual void add(Tensor& output, Tensor& target); - virtual void value(Tensor& val); -private: - APMeter meter_; -}; - -#endif diff --git a/aten/contrib/meter/MSEMeter.cc b/aten/contrib/meter/MSEMeter.cc deleted file mode 100644 index 431415b08ac6a8..00000000000000 --- a/aten/contrib/meter/MSEMeter.cc +++ /dev/null @@ -1,31 +0,0 @@ -#include "MSEMeter.h" -#include -#include - -using namespace at; - -MSEMeter::MSEMeter() { - reset(); -} - -void MSEMeter::reset() { - n_ = 0; - val_ = .0; -} - -void MSEMeter::add(Tensor& output, Tensor& target) { - //assert(isSameSizeAs(output, output - Tensor t = output.sub(target); - Tensor result = t.mul(t).contiguous().toType(CPU(kDouble)); - double * data = result.data(); - for(long n = 0; n < numel(result); ++n) { - n_++; - val_ += ( (1. / ((double)n_ - 1.) * val_) + - ((1. / (double)n_) * data[n])); - } -} - -void MSEMeter::value(Tensor& val) { - //TODO: 0-dim - val.resize_({1}).fill_(val_); -} diff --git a/aten/contrib/meter/MSEMeter.h b/aten/contrib/meter/MSEMeter.h deleted file mode 100644 index 46ccf167d4cf13..00000000000000 --- a/aten/contrib/meter/MSEMeter.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef AT_MSE_METER_H -#define AT_MSE_METER_H - -#include "Meter.h" -#include "ATen/ATen.h" - -class MSEMeter : public Meter -{ -public: - MSEMeter(); - virtual void reset(); - virtual void add(Tensor& output, Tensor& target); - virtual void value(Tensor& val); -private: - double val_; - uint64_t n_; -}; - -#endif diff --git a/aten/contrib/meter/Meter.h b/aten/contrib/meter/Meter.h deleted file mode 100644 index 7f240ea30286fe..00000000000000 --- a/aten/contrib/meter/Meter.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef AT_METER_H -#define AT_METER_H - -#include "ATen/ATen.h" - -using namespace at; - -class Meter -{ -public: - virtual void add(Tensor& output, Tensor& target) = 0; - virtual void value(Tensor& val) = 0; - virtual void reset() = 0; - virtual ~Meter() {}; -}; - -#endif diff --git a/aten/contrib/meter/test/basic.cc b/aten/contrib/meter/test/basic.cc deleted file mode 100644 index 0c87e5d2330eea..00000000000000 --- a/aten/contrib/meter/test/basic.cc +++ /dev/null @@ -1,25 +0,0 @@ -#include "APMeter.h" -#include - -using namespace at; - -int main() -{ - auto && T = CPU(kFloat); - std::cout << "hello\n"; - APMeter meter; - Tensor output = at::randn(T, {10, 7}); - Tensor target = at::zeros(T, {10, 7}); - for(uint64_t n = 0; n < 10; ++n) { - Tensor row = target.select(0,n); - auto row_d = row.data(); - row_d[rand() % 7] = 1.; - } - std::cout << output; - std::cout << target; - meter.add(output, target); - Tensor val; - meter.value(val); - std::cout << "value: " << val << std::endl; - return 0; -} diff --git a/aten/src/ATen/ATen.h b/aten/src/ATen/ATen.h index e41c2d95655d96..84568880fcf8e6 100644 --- a/aten/src/ATen/ATen.h +++ b/aten/src/ATen/ATen.h @@ -15,3 +15,4 @@ #include "ATen/TensorOperators.h" #include "ATen/TensorMethods.h" #include "ATen/Dispatch.h" +#include "ATen/DimVector.h" diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index df7e81cf6cc80c..6eac12a0e54262 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -49,11 +49,13 @@ FILE(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp") FILE(GLOB cuda_cu "cuda/*.cu" "cuda/detail/*.cu") FILE(GLOB cudnn_h "cudnn/*.h" "cudnn/*.cuh") FILE(GLOB cudnn_cpp "cudnn/*.cpp") +FILE(GLOB miopen_cpp "miopen/*.cpp") FILE(GLOB mkl_cpp "mkl/*.cpp") FILE(GLOB mkldnn_cpp "mkldnn/*.cpp") FILE(GLOB native_cpp 
"native/*.cpp") FILE(GLOB native_cudnn_cpp "native/cudnn/*.cpp") +FILE(GLOB native_miopen_cpp "native/miopen/*.cpp") FILE(GLOB native_cuda_cu "native/cuda/*.cu") FILE(GLOB native_cuda_cpp "native/cuda/*.cpp") FILE(GLOB native_mkl_cpp "native/mkl/*.cpp") @@ -70,10 +72,20 @@ endif() IF(NOT NO_CUDA OR USE_ROCM) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/cuda) set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} ${cuda_cu} ${native_cuda_cu}) - set(all_cuda_cpp ${native_cudnn_cpp} ${cuda_cpp} ${native_cuda_cpp} ${cuda_generated_cpp} ${ATen_CUDA_SRCS}) + set(all_cuda_cpp ${cuda_cpp} ${native_cuda_cpp} ${cuda_generated_cpp} ${ATen_CUDA_SRCS}) + + IF(USE_ROCM) + set(all_cuda_cpp ${native_miopen_cpp} ${all_cuda_cpp}) + ELSE() + set(all_cuda_cpp ${native_cudnn_cpp} ${all_cuda_cpp}) + ENDIF() + IF(CUDNN_FOUND) SET(all_cuda_cpp ${all_cuda_cpp} ${cudnn_cpp}) + ELSEIF(MIOPEN_FOUND) + SET(all_cuda_cpp ${all_cuda_cpp} ${miopen_cpp}) ENDIF() + endif() filter_list(generated_h generated_cpp "\\.h$") @@ -203,7 +215,7 @@ if(NOT MSVC) set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) endif() -IF(NOT NO_CUDA) +IF(NOT NO_CUDA AND NOT USE_ROCM) IF ($ENV{ATEN_STATIC_CUDA}) # CuFFT has a complicated static story (especially around CUDA < 9) because it has device callback support # we first have to build a fake lib that links with no device callbacks, @@ -271,9 +283,15 @@ ENDIF() IF(USE_ROCM) ### Link in the ROCm libraries BLAS / RNG. FIND_LIBRARY(HIPBLAS_LIBRARY hipblas HINTS ${HIPBLAS_PATH}/lib) - FIND_LIBRARY(HIPRNG_LIBRARY hiprng HINTS ${HIPRNG_PATH}/lib) + FIND_LIBRARY(HIPRAND_LIBRARY hiprand HINTS ${HIPRAND_PATH}/lib) + + list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${HIPBLAS_LIBRARY} ${HIPRAND_LIBRARY}) - list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${HIPBLAS_LIBRARY} ${HIPRNG_LIBRARY}) + # Set necessary HIPCC Flags + SET(HIP_HCC_FLAGS "-DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${HIP_HCC_FLAGS}") + + # Set necessary CXX Flags + SET(CMAKE_CXX_FLAGS "-DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${CMAKE_CXX_FLAGS}") ENDIF() # Include CPU paths for CUDA as well @@ -371,6 +389,9 @@ if(NOT AT_LINK_STYLE STREQUAL "INTERFACE") if(NOT NO_CUDA OR USE_ROCM) set_property(TARGET ATen_cuda PROPERTY CXX_STANDARD 11) endif() + if(USE_ROCM) + set_target_properties(ATen_cuda PROPERTIES LINKER_LANGUAGE HIP) + endif() endif() endif() diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index 526efb717e4c84..d46d1dc388bd5d 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -354,28 +354,26 @@ inline void CPU_tensor_parallel_apply1( int64_t grain_size = internal::TBB_GRAIN_SIZE) { if (!_apply_preamble({tensor1})) return; - if (tensor1.numel() < grain_size) { - CPU_tensor_apply1(tensor1, op); - return; - } - auto range = tbb::blocked_range(0, tensor1.numel()); if (tensor1.ndimension() < 8) { - tbb::parallel_for( - range, [&tensor1, &op](const tbb::blocked_range r) { + parallel_for( + 0, + tensor1.numel(), + grain_size, + [&tensor1, &op](int64_t begin, int64_t end) { apply_op( - r.end() - r.begin(), - r.begin(), + end - begin, + begin, op, strided_tensor_iter_fixed(tensor1, true)); }); } else { - tbb::parallel_for( - range, [&tensor1, &op](const tbb::blocked_range r) { + parallel_for( + 0, + tensor1.numel(), + grain_size, + [&tensor1, &op](int64_t begin, int64_t end) { apply_op( - r.end() - r.begin(), - r.begin(), - op, - 
strided_tensor_iter(tensor1)); + end - begin, begin, op, strided_tensor_iter(tensor1)); }); } } @@ -388,27 +386,28 @@ inline void CPU_tensor_parallel_apply2( int64_t grain_size = internal::TBB_GRAIN_SIZE) { if (!_apply_preamble({tensor1, tensor2})) return; - if ((tensor1.numel() + tensor2.numel()) < grain_size) { - CPU_tensor_apply2(tensor1, tensor2, op); - return; - } - auto range = tbb::blocked_range(0, tensor1.numel()); if (tensor1.ndimension() < 8 && tensor2.ndimension() < 8) { - tbb::parallel_for( - range, [&tensor1, &tensor2, &op](const tbb::blocked_range r) { + parallel_for( + 0, + tensor1.numel(), + grain_size, + [&tensor1, &tensor2, &op](int64_t begin, int64_t end) { apply_op( - r.end() - r.begin(), - r.begin(), + end - begin, + begin, op, strided_tensor_iter_fixed(tensor1), strided_tensor_iter_fixed(tensor2)); }); } else { - tbb::parallel_for( - range, [&tensor1, &tensor2, &op](const tbb::blocked_range r) { + parallel_for( + 0, + tensor1.numel(), + grain_size, + [&tensor1, &tensor2, &op](int64_t begin, int64_t end) { apply_op( - r.end() - r.begin(), - r.begin(), + end - begin, + begin, op, strided_tensor_iter(tensor1), strided_tensor_iter(tensor2)); diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 6fcb951d184f5c..b8aff319e6271e 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -86,6 +86,9 @@ class AT_API Context { cudaStream_t getCurrentCUDAStream() const { return detail::getCUDAHooks().getCurrentCUDAStream(thc_state.get()); } + cudaStream_t getCurrentCUDAStreamOnDevice(int64_t device) const { + return detail::getCUDAHooks().getCurrentCUDAStreamOnDevice(thc_state.get(), device); + } cudaDeviceProp* getCurrentDeviceProperties() const { return detail::getCUDAHooks().getCurrentDeviceProperties(thc_state.get()); } diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap index c2103d20fd4ba5..6d7c7eab0391c1 100644 --- a/aten/src/ATen/Declarations.cwrap +++ b/aten/src/ATen/Declarations.cwrap @@ -3885,7 +3885,7 @@ aten_custom_call: | if (self_->isScalar()) { // Empty tensor - return self_->type().toScalarType(kLong).tensor({0}); + return toBackend(toDense(backend())).toScalarType(kLong).tensor({0}); } ]] diff --git a/aten/src/ATen/DimVector.h b/aten/src/ATen/DimVector.h new file mode 100644 index 00000000000000..aaa4dc9c07290e --- /dev/null +++ b/aten/src/ATen/DimVector.h @@ -0,0 +1,11 @@ +#pragma once + +#include "SmallVector.h" +#include + +namespace at { + +/// A container for sizes or strides +using DimVector = SmallVector; + +} diff --git a/aten/src/ATen/Parallel.cpp b/aten/src/ATen/Parallel.cpp index c404056465cd01..961cc6df31b2a3 100644 --- a/aten/src/ATen/Parallel.cpp +++ b/aten/src/ATen/Parallel.cpp @@ -7,6 +7,7 @@ #include #include + namespace at { namespace internal { // thread_local variable with internal linkage @@ -51,4 +52,5 @@ void init_tbb_num_threads() { num_threads_ = num_threads; } } -}} // namespace at::internal +} // namespace internal +} // namespace at diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h index 660ae48c791ccf..dba4e35ca74488 100644 --- a/aten/src/ATen/Parallel.h +++ b/aten/src/ATen/Parallel.h @@ -1,13 +1,8 @@ #pragma once #include -#include #include +#include -#ifdef __PPC64__ -using default_partitioner_type = tbb::simple_partitioner; -#else -using default_partitioner_type = tbb::affinity_partitioner; -#endif namespace at { namespace internal { @@ -30,68 +25,28 @@ AT_API void init_tbb_num_threads(); constexpr int64_t TBB_GRAIN_SIZE = 32768; } // namespace internal 
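The rewritten CPU_tensor_parallel_apply helpers above call the new parallel_for directly instead of constructing tbb::blocked_range objects themselves; the helper (defined in Parallel.h just below) runs the body serially when the range is shorter than grain_size and otherwise dispatches to TBB. A minimal usage sketch, assuming the (begin, end, grain_size, functor) signature introduced in this patch and that it lives in namespace at:

    #include <ATen/Parallel.h>
    #include <cstdint>
    #include <vector>

    // Square a buffer in parallel chunks. Each lambda invocation receives a
    // contiguous [begin, end) sub-range and processes it serially.
    void square_all(std::vector<float>& data) {
      at::parallel_for(
          /*begin=*/0,
          /*end=*/static_cast<int64_t>(data.size()),
          /*grain_size=*/2048,
          [&](int64_t begin, int64_t end) {
            for (int64_t i = begin; i < end; ++i) {
              data[i] *= data[i];
            }
          });
    }

Because the grain-size check now lives inside parallel_for, small inputs pay no TBB scheduling overhead, which is why the explicit numel-vs-grain_size early returns in CPUApplyUtils.h could be dropped.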
-template class OP> -T parallel_reduce( - T (*f)(const T*, size_t, size_t, T), - const T* data, - size_t start, - size_t end, - T init_) { +template +void parallel_for( + int64_t begin, + int64_t end, + int64_t grain_size, + F f) { internal::init_tbb_num_threads(); - T result_; - static default_partitioner_type ap; - - if (end - start < internal::TBB_GRAIN_SIZE) { - result_ = f(data, start, end, init_); - } else { - result_ = tbb::parallel_reduce( - tbb::blocked_range(start, end, internal::TBB_GRAIN_SIZE), - init_, - [&data, &f](const tbb::blocked_range r, T init) -> T { - return f(data, r.begin(), r.end(), init); - }, - OP(), - ap); - } - return result_; -} - -template -void parallel_reduce_2d( - void (*f)(const T*, T*, size_t, size_t), - size_t num_rows, - size_t num_cols, - size_t numel, - const T* arr_, - T* outarr_) { - internal::init_tbb_num_threads(); +#ifdef __PPC64__ + using default_partitioner_type = tbb::simple_partitioner; +#else + using default_partitioner_type = tbb::affinity_partitioner; +#endif - static default_partitioner_type ap; + thread_local static default_partitioner_type ap; - size_t max_i_ = - (numel && num_rows && num_cols) ? numel / (num_rows * num_cols) : 0; - if (numel < internal::TBB_GRAIN_SIZE) { - for (size_t i_ = 0; i_ < max_i_; i_++) { - int64_t i = i_ * num_rows * num_cols; - int64_t i_r = i_ * num_cols; - const T* arr = arr_ + i; - T* outarr = outarr_ + i_r; - f(arr, outarr, num_rows, num_cols); - } + if ((end - begin) < grain_size) { + f(begin, end); } else { tbb::parallel_for( - tbb::blocked_range(0, max_i_, 1), - [&arr_, &outarr_, num_rows, num_cols, &f]( - const tbb::blocked_range r) { - for (size_t i_ = r.begin(); i_ < r.end(); i_++) { - int64_t i = i_ * num_rows * num_cols; - int64_t i_r = i_ * num_cols; - const T* arr = arr_ + i; - T* outarr = outarr_ + i_r; - f(arr, outarr, num_rows, num_cols); - } - }, + tbb::blocked_range(begin, end, grain_size), + [f](const tbb::blocked_range& r) { f(r.begin(), r.end()); }, ap); } } diff --git a/aten/src/ATen/ScalarType.h b/aten/src/ATen/ScalarType.h index 840d78b48acfb0..3439cd12b9dade 100644 --- a/aten/src/ATen/ScalarType.h +++ b/aten/src/ATen/ScalarType.h @@ -89,6 +89,18 @@ static inline const char * toString(ScalarType t) { #undef DEFINE_CASE } +static inline size_t elementSize(ScalarType t) { +#define CASE_ELEMENTSIZE_CASE(ctype,name,_2) \ + case ScalarType:: name : return sizeof(ctype); + + switch(t) { + AT_FORALL_SCALAR_TYPES(CASE_ELEMENTSIZE_CASE) + default: + AT_ERROR("Unknown ScalarType"); + } +#undef CASE_ELEMENTSIZE_CASE +} + static inline bool isIntegralType(ScalarType t) { return (t == ScalarType::Byte || t == ScalarType::Char || diff --git a/aten/src/ATen/SmallVector.h b/aten/src/ATen/SmallVector.h index 521e46082bce4a..3a5926a06df8d1 100644 --- a/aten/src/ATen/SmallVector.h +++ b/aten/src/ATen/SmallVector.h @@ -921,6 +921,12 @@ class SmallVector : public SmallVectorImpl { SmallVectorImpl::operator=(::std::move(RHS)); } + template + const SmallVector &operator=(const Container &RHS) { + this->assign(RHS.begin(), RHS.end()); + return *this; + } + SmallVector(SmallVectorImpl &&RHS) : SmallVectorImpl(N) { if (!RHS.empty()) SmallVectorImpl::operator=(::std::move(RHS)); diff --git a/aten/src/ATen/SparseTensorRef.h b/aten/src/ATen/SparseTensorRef.h index f0381d51c28c7e..9c9fada2dc7117 100644 --- a/aten/src/ATen/SparseTensorRef.h +++ b/aten/src/ATen/SparseTensorRef.h @@ -3,8 +3,8 @@ namespace at { struct Tensor; -struct SparseTensor { - explicit SparseTensor(const Tensor& t): tref(t) {} +struct 
SparseTensorRef { + explicit SparseTensorRef(const Tensor& t): tref(t) {} const Tensor& tref; }; diff --git a/aten/src/ATen/cpu/vec256/vec256_base.h b/aten/src/ATen/cpu/vec256/vec256_base.h index 617a57a392c314..e13c594db35d10 100644 --- a/aten/src/ATen/cpu/vec256/vec256_base.h +++ b/aten/src/ATen/cpu/vec256/vec256_base.h @@ -22,7 +22,7 @@ namespace { template struct Vec256 { static constexpr int size = 32 / sizeof(T); - __at_align32__ T values[32 / sizeof(T)]; + T values[32 / sizeof(T)]; Vec256() {} Vec256(T val) { for (int i = 0; i != size; i++) { diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh index 242796eebbef86..0e453fdcdeb1b7 100644 --- a/aten/src/ATen/cuda/CUDAApplyUtils.cuh +++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh @@ -236,13 +236,10 @@ __host__ __device__ __forceinline__ T ATenCeilDiv(T a, T b) { return (a + b - 1) / b; } -inline bool getApplyGrid(uint64_t totalElements, dim3& grid) { - int curDevice = -1; - cudaGetDevice(&curDevice); +inline bool getApplyGrid(uint64_t totalElements, dim3& grid, int64_t curDevice) { if (curDevice == -1) return false; - uint64_t numBlocks = ATenCeilDiv(totalElements, static_cast(AT_APPLY_THREADS_PER_BLOCK)); - uint64_t maxGridX = at::globalContext().getCurrentDeviceProperties()->maxGridSize[0]; + uint64_t maxGridX = at::globalContext().getDeviceProperties(curDevice)->maxGridSize[0]; if (numBlocks > maxGridX) numBlocks = maxGridX; grid = dim3(numBlocks); @@ -286,7 +283,9 @@ bool CUDA_tensor_apply2(at::Tensor a, const dim3 block = getApplyBlock(); dim3 grid; - if (!getApplyGrid(totalElements, grid)) { + int64_t curDevice = current_device(); + if (curDevice == -1) return false; + if (!getApplyGrid(totalElements, grid, curDevice)) { return false; } @@ -323,7 +322,7 @@ bool CUDA_tensor_apply2(at::Tensor a, scalar1, \ scalar2, \ TYPE, A, B> \ - <<>>( \ + <<>>( \ aInfo, bInfo, (TYPE) totalElements, op); #define HANDLE_B_CASE(TYPE, A, B) { \ @@ -466,7 +465,9 @@ bool CUDA_tensor_apply3(at::Tensor a, const dim3 block = getApplyBlock(); dim3 grid; - if (!getApplyGrid(totalElements, grid)) { + int64_t curDevice = current_device(); + if (curDevice == -1) return false; + if (!getApplyGrid(totalElements, grid, curDevice)) { return false; } @@ -501,7 +502,7 @@ bool CUDA_tensor_apply3(at::Tensor a, scalar2, \ scalar3, \ TYPE, A, B, C> \ - <<>>( \ + <<>>( \ aInfo, bInfo, cInfo, (TYPE) totalElements, op); #define HANDLE_C_CASE(TYPE, A, B, C) { \ @@ -685,7 +686,9 @@ bool CUDA_tensor_apply4(at::Tensor a, const dim3 block = getApplyBlock(); dim3 grid; - if (!getApplyGrid(totalElements, grid)) { + int64_t curDevice = current_device(); + if (curDevice == -1) return false; + if (!getApplyGrid(totalElements, grid, curDevice)) { return false; } @@ -727,7 +730,7 @@ bool CUDA_tensor_apply4(at::Tensor a, scalar3, \ scalar4, \ TYPE, A, B, C, D> \ - <<>>( \ + <<>>( \ aInfo, bInfo, cInfo, dInfo, (TYPE) totalElements, op); #define HANDLE_D_CASE(TYPE, A, B, C, D) { \ diff --git a/aten/src/ATen/cuda/CUDAConfig.h.in b/aten/src/ATen/cuda/CUDAConfig.h.in index 72adee50cf84fb..9e4b3d35c09bec 100644 --- a/aten/src/ATen/cuda/CUDAConfig.h.in +++ b/aten/src/ATen/cuda/CUDAConfig.h.in @@ -5,3 +5,4 @@ // c.f. 
https://stackoverflow.com/questions/33759787/generating-an-error-if-checked-boolean-macro-is-not-defined #define AT_CUDNN_ENABLED() @AT_CUDNN_ENABLED@ +#define AT_MIOPEN_ENABLED() @AT_MIOPEN_ENABLED@ diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index fb2d25f54f86c8..05d408786d47b4 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -48,6 +48,9 @@ bool CUDAHooks::hasCuDNN() const { cudaStream_t CUDAHooks::getCurrentCUDAStream(THCState* thc_state) const { return THCState_getCurrentStream(thc_state); } +cudaStream_t CUDAHooks::getCurrentCUDAStreamOnDevice(THCState* thc_state, int64_t device) const { + return THCState_getCurrentStreamOnDevice(thc_state, device); +} struct cudaDeviceProp* CUDAHooks::getCurrentDeviceProperties(THCState* thc_state) const { return THCState_getCurrentDeviceProperties(thc_state); } @@ -76,6 +79,10 @@ bool CUDAHooks::compiledWithCuDNN() const { return AT_CUDNN_ENABLED(); } +bool CUDAHooks::compiledWithMIOpen() const { + return AT_MIOPEN_ENABLED(); +} + bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { #if AT_CUDNN_ENABLED() cudaDeviceProp* prop = getCurrentDeviceProperties(globalContext().getTHCState()); diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index 69b0fd658eebe0..52dd836ace3885 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -15,12 +15,14 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool hasCUDA() const override; bool hasCuDNN() const override; cudaStream_t getCurrentCUDAStream(THCState*) const override; + cudaStream_t getCurrentCUDAStreamOnDevice(THCState*, int64_t device) const override; struct cudaDeviceProp* getCurrentDeviceProperties(THCState*) const override; struct cudaDeviceProp* getDeviceProperties(THCState*, int device) const override; int64_t current_device() const override; std::unique_ptr newPinnedMemoryAllocator() const override; void registerCUDATypes(Context*) const override; bool compiledWithCuDNN() const override; + bool compiledWithMIOpen() const override; bool supportsDilatedConvolutionWithCuDNN() const override; long versionCuDNN() const override; double batchnormMinEpsilonCuDNN() const override; diff --git a/aten/src/ATen/cudnn/Handles.h b/aten/src/ATen/cudnn/Handles.h index 2845dec500f9a1..369b1f3410e03d 100644 --- a/aten/src/ATen/cudnn/Handles.h +++ b/aten/src/ATen/cudnn/Handles.h @@ -1,9 +1,10 @@ #pragma once #include "cudnn-wrapper.h" +#include "ATen/cuda/ATenCUDAGeneral.h" namespace at { namespace native { -cudnnHandle_t getCudnnHandle(); +AT_CUDA_API cudnnHandle_t getCudnnHandle(); }} // namespace diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 2ec5ad67cbf621..6d87dc4478bfdc 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -62,6 +62,10 @@ struct AT_API CUDAHooksInterface { AT_ERROR("cannot getCurrentCUDAStream() without ATen_cuda library"); } + virtual cudaStream_t getCurrentCUDAStreamOnDevice(THCState*, int64_t device) const { + AT_ERROR("cannot getCurrentCUDAStream() without ATen_cuda library"); + } + virtual struct cudaDeviceProp* getCurrentDeviceProperties(THCState*) const { AT_ERROR("cannot getCurrentDeviceProperties() without ATen_cuda library"); } @@ -86,6 +90,10 @@ struct AT_API CUDAHooksInterface { return false; } + virtual bool compiledWithMIOpen() const { + return false; + } + virtual bool 
supportsDilatedConvolutionWithCuDNN() const { return false; } diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index f151ecc962b08c..1437101ff2fb12 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -169,7 +169,7 @@ def __init__(self, reason): TYPE_FORMAL_GENERIC = { 'THTensor*': 'Tensor &', - 'THSTensor*': 'SparseTensor', + 'THSTensor*': 'SparseTensorRef', 'THBoolTensor*': 'Tensor &', 'THIndexTensor*': 'Tensor &', 'THIntegerTensor*': 'Tensor &', @@ -186,7 +186,7 @@ def __init__(self, reason): DYNAMIC_TYPE = { 'THTensor*': 'Tensor', - 'THSTensor*': 'SparseTensor', + 'THSTensor*': 'SparseTensorRef', 'THBoolTensor*': 'BoolTensor', 'THIndexTensor*': 'IndexTensor', 'THIntegerTensor*': 'IntegerTensor', @@ -1133,7 +1133,7 @@ def handle_sparse(env, option): return [] check_name = option['when_sparse_dispatch'] sparse_actuals = [arg['name'] - if arg['name'] != check_name else "SparseTensor({})".format(arg['name']) + if arg['name'] != check_name else "SparseTensorRef({})".format(arg['name']) for arg in option['formals_list']] return [SPARSE_CHECK.substitute(env, check_name=check_name, sparse_actuals=sparse_actuals)] diff --git a/aten/src/ATen/miopen/Descriptors.cpp b/aten/src/ATen/miopen/Descriptors.cpp new file mode 100644 index 00000000000000..f3d720c6bcdd19 --- /dev/null +++ b/aten/src/ATen/miopen/Descriptors.cpp @@ -0,0 +1,120 @@ +#include "Descriptors.h" +#include + +namespace at { namespace native { + +namespace { + +inline miopenDataType_t getDataType(const at::Type& t) { + auto scalar_type = t.scalarType(); + if (scalar_type == at::kFloat) { + return miopenFloat; + } else if (scalar_type == at::kHalf) { + return miopenHalf; + } + throw std::runtime_error("TensorDescriptor only supports float and half tensors"); +} + +inline miopenDataType_t getDataType(const at::Tensor& t) { + return getDataType(t.type()); +} + +} // anonymous namespace + + +void TensorDescriptor::set(const at::Tensor &t, size_t pad) { + set(getDataType(t), t.sizes(), t.strides(), pad); +} + +static int MIOPEN_DIM_MAX = 4; + +void TensorDescriptor::set(miopenDataType_t datatype, IntList t_sizes, IntList t_strides, size_t pad) { + size_t dim = t_sizes.size(); + if (dim > MIOPEN_DIM_MAX || pad > MIOPEN_DIM_MAX) +#define _STR(X) #X +#define STR(X) _STR(X) + throw std::runtime_error("MIOpen supports only up to " STR(MIOPEN_DIM_MAX) " dimensions"); +#undef _STR +#undef STR + int size[MIOPEN_DIM_MAX]; + int stride[MIOPEN_DIM_MAX]; + for (size_t i = 0; i < dim; ++i) { + size[i] = static_cast(t_sizes[i]); + stride[i] = static_cast(t_strides[i]); + } + for (size_t i = dim; i < pad; ++i) { + size[i] = 1; + stride[i] = 1; + } + set(datatype, static_cast(std::max(dim, pad)), size, stride); +} + +std::string miopenTypeToString(miopenDataType_t dtype) { + switch (dtype) { + case miopenFloat: + return "miopenFloat"; + case miopenHalf: + return "miopenHalf"; + default: + std::ostringstream oss; + oss << "(unknown data-type " << static_cast(dtype) << ")"; + return oss.str(); + } +} + +std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) { + out << "TensorDescriptor " << static_cast(d.desc()) << "\n"; + int nbDims = 4; + int dimA[MIOPEN_DIM_MAX]; + int strideA[MIOPEN_DIM_MAX]; + miopenDataType_t dtype; + miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA); + out << " type = " << miopenTypeToString(dtype) << "\n"; + out << " nbDims = " << nbDims << "\n"; + // Read out only nbDims of the arrays! 
+ out << " dimA = "; + for (auto i : ArrayRef{dimA, static_cast(nbDims)}) { + out << i << ", "; + } + out << "\n"; + out << " strideA = "; + for (auto i : ArrayRef{strideA, static_cast(nbDims)}) { + out << i << ", "; + } + out << "\n"; + return out; +} + +void TensorDescriptor::print() { std::cout << *this; } + +void FilterDescriptor::set(const at::Tensor &t, int64_t pad) { + auto dim = t.ndimension(); + if (dim > MIOPEN_DIM_MAX || pad > MIOPEN_DIM_MAX) +#define _STR(X) #X +#define STR(X) _STR(X) + throw std::runtime_error("MIOpen supports only up to " STR(MIOPEN_DIM_MAX) " dimensions"); +#undef _STR +#undef STR + if (!t.is_contiguous()) { + // NB: It is possible for this test to be insufficient, because the + // Tensor passed in to set the filter descriptor may not be the actual + // Tensor whose data pointer is passed to cuDNN. Nevertheless, + // that is the common case, so we can catch most client errors with this test. + throw std::runtime_error("MIOpen filters (a.k.a. weights) must be contiguous"); + } + int size[MIOPEN_DIM_MAX]; + int stride[MIOPEN_DIM_MAX]; + for (int i = 0; i < dim; ++i) { + size[i] = (int) t.size(i); + } + for (int i = dim; i < pad; ++i) { + size[i] = (int) 1; + } + for (int i = dim - 1; i >=0; --i) { + stride[i] = (i == dim - 1) ? 1 : stride[i+1] * size[i+1]; + } + dim = std::max(dim, pad); + set(getDataType(t), (int) dim, size, stride); +} + +}} diff --git a/aten/src/ATen/miopen/Descriptors.h b/aten/src/ATen/miopen/Descriptors.h new file mode 100644 index 00000000000000..eb259ea6199f4a --- /dev/null +++ b/aten/src/ATen/miopen/Descriptors.h @@ -0,0 +1,323 @@ +#pragma once + +#include "Exceptions.h" + +#include "miopen-wrapper.h" +#include +#include +#include + +/* +Note [cuDNN dropout descriptor initialization] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In most cases, setting descriptors in cuDNN is cheap (e.g., +cudnnSetTensorNdDescriptor). However, this is not the case for +cudnnSetDropoutDescriptor: in cuDNN 6/7 (and possibly others) it does an +expensive precomputation to initialize the random number generator states. In +cuDNN 6, this is the ONLY official mechanism to initialize a dropout descriptor, +which means that law-abiding clients were expected to generate a dropout +descriptor once and cache it. However, our ATen interface is (1) stateless (so +we can't cache the descriptors) and (2) does not accept arbitrary user types in +its interface (so we can't pass the descriptor in). This puts us in a pickle. + +In cuDNN 7, a new function, cudnnRestoreDropoutDescriptor was added, which +forgoes the expensive initialization process, and can initialize the +descriptor with a pre-initialized state CUDA tensor. This is great, because +it means we can simply pass in the state tensor and then initialize the +descriptor internally. Unfortunately, this function is not available in +cuDNN 6. + +To work around this, we break the cuDNN abstraction barrier, and have +the struct layout of the underlaying dropout descriptor. With this struct, +we can reimplement cudnnRestoreDropoutDescriptor from scratch. Great! 
+*/ + +namespace at { namespace native { + +// TODO: Add constructors for all of the descriptors + +inline int dataSize(miopenDataType_t dataType) +{ + switch (dataType) { + case miopenHalf: return 2; + case miopenFloat: return 4; + default: return 8; + } +} + +// The stride for a size-1 dimensions is not uniquely determined; in +// fact, it can be anything you want, because the fact that the +// tensor is size 1 at this dimension means that you will never actually +// try advancing your pointer by this stride. +// +// However, CuDNN has a much more stringent requirement on strides: +// if you are passing a contiguous input, it better be the case +// that the stride for dim i is the product of the sizes of dims +// i+1 to the end. This stride is indeed uniquely determined. This +// function modifies 'stride' in place so this invariant holds. +static inline void fixSizeOneDimStride(int dim, const int *size, int *stride) { + int64_t z = 1; + for(int d = dim-1; d >= 0; d--) + { + if (size[d] == 1) { + stride[d] = z; + } else { + z *= size[d]; + } + } +} + +template +struct DescriptorDeleter { + void operator()(T* x) { + if (x != nullptr) { + MIOPEN_CHECK(dtor(x)); + } + } +}; + +// A generic class for wrapping cuDNN descriptor types. All you need +// is to give the underlying type the Descriptor_t points to (usually, +// if it's miopenTensorDescriptor_t it points to miopenTensorStruct), +// the constructor and the destructor. Subclasses are responsible +// for defining a set() function to actually set the descriptor. +// +// Descriptors default construct to a nullptr, and have a descriptor +// initialized the first time you call set() or any other initializing +// function. +template +class Descriptor +{ +public: + // TODO: Figure out why const-correctness doesn't work here + + // Use desc() to access the underlying descriptor pointer in + // a read-only fashion. Most client code should use this. + // If the descriptor was never initialized, this will return + // nullptr. + T* desc() const { return desc_.get(); } + T* desc() { return desc_.get(); } + + // Use mut_desc() to access the underlying desciptor pointer + // if you intend to modify what it points to (e.g., using + // miopenSetFooDescriptor). This will ensure that the descriptor + // is initialized. Code in this file will use this function. + T* mut_desc() { init(); return desc_.get(); } +protected: + void init() { + if (desc_ == nullptr) { + T* raw_desc; + MIOPEN_CHECK(ctor(&raw_desc)); + desc_.reset(raw_desc); + } + } +private: + std::unique_ptr> desc_; +}; + +class TensorDescriptor + : public Descriptor +{ +public: + TensorDescriptor() {} + explicit TensorDescriptor(const at::Tensor &t, size_t pad = 0) { + set(t, pad); + } + + // Note [CuDNN broadcast padding] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // pad specifies the minimum dimensionality of the tensor descriptor + // we produce (it doesn't have anything to do with, e.g., convolution + // padding). If 't' is lower-dimensional than 'pad', the remaining + // dimensions (on the right) are padded with ones. This doesn't + // affect the underlying data layout. This is particularly useful for + // dealing with a pecularity of the CuDNN API, which is that broadcasting in CuDNN is + // done in two steps: first, the client code is expected to pad out + // (the dimensions) input tensors to be the same dimension as the + // target broadcast, and then second, CuDNN takes of actually + // broadcasting size 1 dimensions. 
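// Editorial illustration, not part of the original patch: with the padding
// rule described above, set(t, /*pad=*/4) on a 2-d tensor of sizes {N, C}
// yields a 4-d descriptor with sizes {N, C, 1, 1}; fixSizeOneDimStride() then
// rewrites the strides of the size-1 dimensions so the descriptor still looks
// contiguous. This is what lets the MIOpen batch-norm code later in this diff
// pair a low-dimensional scale/bias tensor with a 4-d input descriptor.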
+ + void set(const at::Tensor &t, size_t pad = 0); + void set(miopenDataType_t dataType, IntList sizes, IntList strides, size_t pad = 0); + + void print(); + +private: + void set(miopenDataType_t dataType, int dim, int* size, int* stride) { + fixSizeOneDimStride(dim, size, stride); + MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride)); + } +}; + +std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d); + +class FilterDescriptor + : public Descriptor +{ +public: + void set(const at::Tensor &t, int64_t pad = 0); + +private: + void set(miopenDataType_t dataType, int dim, int* size, int* stride) { + MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride)); + } +}; + +struct ConvolutionDescriptor + : public Descriptor +{ + void set(miopenDataType_t dataType, int dim, int* pad, int* stride, int * upscale /* aka dilation */, int groups) { + miopenDataType_t mathType = dataType; + if (dataType == miopenHalf) mathType = miopenFloat; + //?????????????? What the fuck? It's asking for the stride for the height & stride for the width seperately....! + MIOPEN_CHECK(miopenInitConvolutionDescriptor(mut_desc(), miopenConvolution, *pad, *pad, *stride, *stride, 1, 1)); +#if 0 + CUDNN_CHECK(cudnnSetConvolutionGroupCount(mut_desc(), groups)); + CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_DEFAULT_MATH)); + if(dataType == CUDNN_DATA_HALF) + CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH)); +#endif + } +}; + +struct SpatialTransformerDescriptor + : public Descriptor +{ + void set(miopenDataType_t dataType, int dim, int* size) { + //?????????????? This function doesn't even exist! + MIOPEN_CHECK(miopenSetSpatialTransformerNdDescriptor(mut_desc(), MIOPEN_SAMPLER_BILINEAR, dataType, dim, size)); + } +}; + +#if 0 // DROPOUT NOT IMPLEMENTED IN MIOpen + +// See Note [cuDNN dropout descriptor initialization] +inline cudnnStatus_t cudnnRestoreDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float dropout, + void *states, + size_t stateSizeInBytes, + unsigned long long seed) { + // Try to accurately simulate cuDNN's behavior, for our cuDNN 6 friends. + // This is not entirely accurate but is good enough to catch some API + // uses which would not be compatible in cuDNN 7. Feel free to fix + // this if you notice something is wrong. + if (states == nullptr) return CUDNN_STATUS_INVALID_VALUE; + if (stateSizeInBytes == 0) return CUDNN_STATUS_INVALID_VALUE; + size_t expectedStateSizeInBytes; + // State size will differ depending on size of GPU + auto ret = cudnnDropoutGetStatesSize(handle, &expectedStateSizeInBytes); + if (ret != CUDNN_STATUS_SUCCESS) return ret; + if (expectedStateSizeInBytes != stateSizeInBytes) return CUDNN_STATUS_INVALID_VALUE; + dropoutDesc->dropout = dropout; + dropoutDesc->nstates = (int)stateSizeInBytes/sizeof(curandState_t); + dropoutDesc->states = states; + return CUDNN_STATUS_SUCCESS; +} + +#endif // DROPOUT NOT IMPLEMENTED IN MIOpen + +#if 0 // DROPOUT NOT IMPLEMENTED IN MIOpen +struct DropoutDescriptor + : public Descriptor +{ + at::Tensor state; + + // Initialize a dropout descriptor's RNG state. + // WARNING: This function is very expensive, avoid calling this function! 
+ // NB: it takes a Type so that we can generate a Variable if necessary + void initialize_rng(const at::Type& ty, miopenHandle_t handle, float dropout, long long int seed) { + AT_ASSERT(dropout > 0, "dropout must be nonzero; otherwise call set_no_dropout"); + size_t state_size; + MIOPEN_CHECK(miopenDropoutGetStatesSize(handle, &state_size)); + AT_ASSERT(ty.is_cuda(), "dropout state type must be CUDA type"); + AT_ASSERT(ty.scalarType() == kByte, "dropout state type must be byte"); + state = ty.tensor({static_cast(state_size)}); + MIOPEN_CHECK(miopenSetDropoutDescriptor(mut_desc(), handle, dropout, state.data_ptr(), state_size, seed)); + } + + // Restore a dropout descriptor given a dropout probability and existing RNG state. + // See Note [cuDNN dropout descriptor initialization] + void set(miopenHandle_t handle, float dropout, at::Tensor state_) { + AT_ASSERT(dropout > 0, "dropout must be nonzero; otherwise call set_no_dropout"); + state = state_; + void *state_ptr = state.data_ptr(); + size_t state_size = state.size(0); + // NB: The seed doesn't actually matter, so we give a dummy value + MIOPEN_CHECK(miopenRestoreDropoutDescriptor(mut_desc(), handle, dropout, state_ptr, state_size, 0 /* seed */)); + } + + // Restore a dropout descriptor corresponding to no dropout + // See Note [cuDNN dropout descriptor initialization] + void set_no_dropout(miopenHandle_t handle) { + // NB: seed doesn't matter when dropout = 0, because no random number + // initialization actually takes place when there is no dropout. + // NB: Empirically, miopenSetDropoutDescriptor is cheap when + // dropoot == 0 + MIOPEN_CHECK(miopenSetDropoutDescriptor(mut_desc(), handle, 0 /* dropout */, nullptr, 0 /* state_size */, 0 /* seed */)); + } +}; +#endif // DROPOUT NOT IMPLEMENTED IN MIOpen + +struct RNNDescriptor + : public Descriptor +{ + DropoutDescriptor dropout_desc_; + void set(miopenHandle_t handle, int hidden_size, int num_layers, DropoutDescriptor&& dropout_desc, + miopenRNNInputMode_t input_mode, miopenRNNDirectionMode_t bidirectional, + miopenRNNMode_t mode, miopenDataType_t datatype) { + dropout_desc_ = std::move(dropout_desc); + MIOPEN_CHECK(miopenSetRNNDescriptor( + handle, + mut_desc(), + hidden_size, + num_layers, + dropout_desc_.desc(), + input_mode, + bidirectional, + mode, + MIOPEN_RNN_ALGO_STANDARD, + datatype)); +#if 0 + hipDeviceProp_t* prop = globalContext().getCurrentDeviceProperties(); + if (prop->major >= 7) { + if (datatype == CUDNN_DATA_HALF) { + cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_TENSOR_OP_MATH); + } else { + // Technically, as the default it's not necessary to explicitly + // set this. 
+ cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_DEFAULT_MATH); + } + } +#endif + } +}; + +union Constant +{ + float f; + double d; + Constant(miopenDataType_t dataType, double value) { + if (dataType == miopenHalf || dataType == miopenFloat) { + f = (float) value; + } else { + d = value; + } + } +}; + +}} // namespace diff --git a/aten/src/ATen/miopen/Exceptions.h b/aten/src/ATen/miopen/Exceptions.h new file mode 100644 index 00000000000000..06180eb7a7a7e4 --- /dev/null +++ b/aten/src/ATen/miopen/Exceptions.h @@ -0,0 +1,74 @@ +#pragma once + +#include "miopen-wrapper.h" +#include +#include +#include + +struct THCState; + +namespace at { namespace native { + +class miopen_exception : public std::runtime_error { +public: + miopenStatus_t status; + miopen_exception(miopenStatus_t status, const char* msg) + : std::runtime_error(msg) + , status(status) {} + miopen_exception(miopenStatus_t status, const std::string& msg) + : std::runtime_error(msg) + , status(status) {} +}; + +const char* miopenGetErrorString(miopenStatus_t status) { + switch (status) { + case miopenStatusSuccess: + return "miopenStatusSuccess"; + + case miopenStatusNotInitialized: + return "miopenStatusNotInitialized"; + + case miopenStatusInvalidValue: + return "miopenStatusInvalidValue"; + + case miopenStatusBadParm: + return "miopenStatusBadParm"; + + case miopenStatusAllocFailed: + return "miopenStatusAllocFailed"; + + case miopenStatusInternalError: + return "miopenStatusInternalError"; + + case miopenStatusNotImplemented: + return "miopenStatusNotImplemented"; + + case miopenStatusUnknownError: + return "miopenStatusUnknownError"; + + default: + return "Unrecognized Status Code"; + } +} + +inline void MIOPEN_CHECK(miopenStatus_t status) +{ + if (status != miopenStatusSuccess) { + if (status == miopenStatusNotImplemented) { + throw miopen_exception(status, std::string(miopenGetErrorString(status)) + + ". 
This error may appear if you passed in a non-contiguous input."); + } + throw miopen_exception(status, miopenGetErrorString(status)); + } +} + +inline void CUDA_CHECK(hipError_t error) +{ + if (error != hipSuccess) { + std::string msg("HIP error: "); + msg += hipGetErrorString(error); + throw std::runtime_error(msg); + } +} + +}} // namespace at::miopen diff --git a/aten/src/ATen/miopen/Handles.cpp b/aten/src/ATen/miopen/Handles.cpp new file mode 100644 index 00000000000000..d37b1043fdeca5 --- /dev/null +++ b/aten/src/ATen/miopen/Handles.cpp @@ -0,0 +1,42 @@ +#include "Handles.h" + +#include "Exceptions.h" + +#include +#include + +// TODO: Get rid of the mutex, and just initialize these +// handles in at::Context along with lazy CUDA initialization + +namespace at { namespace native { + +namespace { + +struct Handle { + miopenHandle_t handle; + Handle() : handle(NULL) { + MIOPEN_CHECK(miopenCreate(&handle)); + } + ~Handle() { + if (handle) { + miopenDestroy(handle); + } + } +}; + +std::mutex mutex; +std::unordered_map handles; + +} // namespace + + +miopenHandle_t getMiopenHandle() +{ + int device; + CUDA_CHECK(hipGetDevice(&device)); + + std::lock_guard guard(mutex); + return handles[device].handle; +} + +}} // namespace at::miopen diff --git a/aten/src/ATen/miopen/Handles.h b/aten/src/ATen/miopen/Handles.h new file mode 100644 index 00000000000000..e8df69270f17f3 --- /dev/null +++ b/aten/src/ATen/miopen/Handles.h @@ -0,0 +1,9 @@ +#pragma once + +#include "miopen-wrapper.h" + +namespace at { namespace native { + +miopenHandle_t getMiopenHandle(); + +}} // namespace diff --git a/aten/src/ATen/miopen/README.md b/aten/src/ATen/miopen/README.md new file mode 100644 index 00000000000000..f1e9f16e99f3c7 --- /dev/null +++ b/aten/src/ATen/miopen/README.md @@ -0,0 +1,4 @@ +All files living in this directory are written with the assumption that MIOpen is available, +which means that this code is not guarded by `#if AT_MIOPEN_ENABLED()`. Therefore, whenever +you need to use definitions from here, please guard the `#include` and +definition usages with the `#if AT_MIOPEN_ENABLED()` macro, e.g. [native/miopen/BatchNorm.cpp](native/miopen/BatchNorm.cpp).
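To make the README's guarding convention concrete, here is a minimal sketch (editorial; the consumer function maybe_use_miopen() is hypothetical, and the include path of the generated config header is an assumption):

#include "ATen/cuda/CUDAConfig.h"   // provides AT_MIOPEN_ENABLED()

#if AT_MIOPEN_ENABLED()
#include "ATen/miopen/Handles.h"
#endif

void maybe_use_miopen() {
#if AT_MIOPEN_ENABLED()
  // MIOpen symbols may only be referenced inside the guard.
  miopenHandle_t handle = at::native::getMiopenHandle();
  (void)handle;
#else
  // MIOpen headers were never included; fall back or raise here.
#endif
}

native/miopen/BatchNorm.cpp below applies the same idea with its #if !AT_MIOPEN_ENABLED / #else split.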
diff --git a/aten/src/ATen/miopen/Types.cpp b/aten/src/ATen/miopen/Types.cpp new file mode 100644 index 00000000000000..d35927f58acf40 --- /dev/null +++ b/aten/src/ATen/miopen/Types.cpp @@ -0,0 +1,24 @@ +#include "Types.h" + +#include +#include "miopen/version.h" + +namespace at { namespace native { + +miopenDataType_t getMiopenDataType(const at::Tensor& tensor) { + if (tensor.type().scalarType() == at::kFloat) { + return miopenFloat; + } else if (tensor.type().scalarType() == at::kHalf) { + return miopenHalf; + } + std::string msg("getMiopenDataType() not supported for "); + msg += at::toString(tensor.type().scalarType()); + throw std::runtime_error(msg); +} + +int64_t miopen_version() { + //return MIOPEN_VERSION_TWEAK; + return (MIOPEN_VERSION_MAJOR<<8) + (MIOPEN_VERSION_MINOR<<4) + MIOPEN_VERSION_PATCH; +} + +}} // namespace at::miopen diff --git a/aten/src/ATen/miopen/Types.h b/aten/src/ATen/miopen/Types.h new file mode 100644 index 00000000000000..0034aa4d84a2b1 --- /dev/null +++ b/aten/src/ATen/miopen/Types.h @@ -0,0 +1,12 @@ +#pragma once + +#include "miopen-wrapper.h" +#include + +namespace at { namespace native { + +miopenDataType_t getMiopenDataType(const at::Tensor& tensor); + +int64_t miopen_version(); + +}} // namespace at::miopen diff --git a/aten/src/ATen/miopen/Utils.h b/aten/src/ATen/miopen/Utils.h new file mode 100644 index 00000000000000..3a1128b482f4a6 --- /dev/null +++ b/aten/src/ATen/miopen/Utils.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include "THC/THC.h" +#include "miopen-wrapper.h" +#include "Handles.h" + +namespace at { namespace native { + +inline void setMIOpenStreamToCurrent() { + // TODO: Should getCurrentStream be a method on Context? + MIOPEN_CHECK(miopenSetStream(getMiopenHandle(), THCState_getCurrentStream(globalContext().getTHCState()))); +} + +// cuDNN has a buggy check for tensor being contiguous (that is, it does +// not ignore stride for dimension that is equal to 0). This function +// makes tensors which have zero stride contiguous, by setting the +// strides to 1 as cuDNN likes. 
+inline Tensor contiguousIfZeroInStrides(const Tensor& t) { + for (auto s : t.strides()) { + if (s == 0) return t.contiguous(); + } + return t; +} + +}} diff --git a/aten/src/ATen/miopen/miopen-wrapper.h b/aten/src/ATen/miopen/miopen-wrapper.h new file mode 100644 index 00000000000000..1e378aedbef17a --- /dev/null +++ b/aten/src/ATen/miopen/miopen-wrapper.h @@ -0,0 +1,3 @@ +#pragma once + +#include diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 88d1acb2dddd02..a293dbbf7145f7 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -102,6 +102,11 @@ auto ConvParams::view1d_as_2d() -> void { } auto ConvParams::use_cudnn(const at::Tensor& input) const -> bool { + if (detail::getCUDAHooks().compiledWithMIOpen()) { + if (!input.type().is_cuda() || !cudnn_enabled) + return false; + return true; + } if (!detail::getCUDAHooks().compiledWithCuDNN()) { return false; } @@ -151,9 +156,9 @@ static void check_input_shape_forward(const at::Tensor& input, if (weight_dim != k) { std::stringstream ss; - ss << "Expected " << k << "-dimensional weight for " << k - << "-dimensional input " << input.sizes() << ", but got weight of size " - << weight.sizes() << " instead"; + ss << "Expected " << weight_dim << "-dimensional input for " << weight_dim + << "-dimensional weight " << weight.sizes() << ", but got input of size " + << input.sizes() << " instead"; throw std::runtime_error(ss.str()); } if (weight.size(0) < groups) { @@ -167,8 +172,8 @@ static void check_input_shape_forward(const at::Tensor& input, if (!transposed) { if (input.size(1) != (weight.size(1) * groups)) { std::stringstream ss; - ss << "Given groups=" << groups << ", weight" << weight.sizes() - << ", so expected input" << input.sizes() << " to have " + ss << "Given groups=" << groups << ", weight of size " << weight.sizes() + << ", expected input" << input.sizes() << " to have " << (weight.size(1) * groups) << " channels, but got " << input.size(1) << " channels instead"; throw std::runtime_error(ss.str()); @@ -183,8 +188,8 @@ static void check_input_shape_forward(const at::Tensor& input, } else { // transposed if (input.size(1) != weight.size(0)) { std::stringstream ss; - ss << "Given transposed=" << transposed << ", weight" << weight.sizes() - << ", so expected input" << input.sizes() << " to have " + ss << "Given transposed=" << transposed << ", weight of size " << weight.sizes() + << ", expected input" << input.sizes() << " to have " << weight.size(0) << " channels, but got " << input.size(1) << " channels instead"; throw std::runtime_error(ss.str()); @@ -295,11 +300,11 @@ at::Tensor _convolution( auto input = input_r.contiguous(); auto weight = weight_r; auto bias = bias_r; - auto k = input.ndimension(); + auto k = weight.ndimension(); int64_t dim = k - 2; if (dim <= 0) { - throw std::runtime_error("input has less dimensions than expected"); + throw std::runtime_error("weight should have at least two dimensions"); } ConvParams params; diff --git a/aten/src/ATen/native/Indexing.cpp b/aten/src/ATen/native/Indexing.cpp index 65c9b1005fb391..0446b8c8a392c8 100644 --- a/aten/src/ATen/native/Indexing.cpp +++ b/aten/src/ATen/native/Indexing.cpp @@ -147,7 +147,7 @@ static Tensor wrapIndexOnce(const Tensor & index, int64_t dim, int64_t dim_size) auto min_idx = index.min().toCLong(); if (max_idx >= dim_size) { AT_ERROR("index ", max_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); - } + } if (min_idx < -dim_size) { AT_ERROR("index ", min_idx, 
" is out of bounds for dimension ", dim, " with size ", dim_size); } @@ -249,6 +249,18 @@ Tensor index(const Tensor & self, TensorList indices) { return src.take(linearIndex); } +Tensor index_put(const Tensor & self, TensorList indices, const Tensor & value) { + if (indices.size() > (size_t)self.dim()) { + AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); + } + + Tensor src, linearIndex, expandedValue; + std::tie(src, linearIndex) = makeLinearIndex(self, indices); + std::tie(expandedValue) = expand_inplace(linearIndex, value); + Tensor dst = src.clone(); + return dst.put_(linearIndex, expandedValue); +} + Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & value) { if (indices.size() > (size_t)self.dim()) { AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index c0758c867decb0..02653629e55bfe 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -1,4 +1,3 @@ -#include #include "ATen/ATen.h" #include "ATen/AccumulateType.h" #include "ATen/NativeFunctions.h" @@ -11,11 +10,8 @@ namespace at { namespace native { namespace { -static default_partitioner_type ap; - template void host_softmax(Tensor output, const Tensor& input, const int64_t dim) { - internal::init_tbb_num_threads(); int64_t outer_size = 1; int64_t dim_size = input.size(dim); int64_t inner_size = 1; @@ -28,10 +24,10 @@ void host_softmax(Tensor output, const Tensor& input, const int64_t dim) { scalar_t* input_data_base = input.data(); scalar_t* output_data_base = output.data(); int64_t grain_size = std::min(internal::TBB_GRAIN_SIZE / dim_size, (int64_t)1); - tbb::parallel_for( - tbb::blocked_range(0, outer_size * inner_size, grain_size), - [&](const tbb::blocked_range& r) { - for (int64_t i = r.begin(); i < r.end(); i++) { + parallel_for( + 0, outer_size * inner_size, grain_size, + [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; i++) { int64_t outer_idx = i / inner_size; int64_t inner_idx = i % inner_size; scalar_t* input_data = @@ -62,8 +58,7 @@ void host_softmax(Tensor output, const Tensor& input, const int64_t dim) { else output_data[d * dim_stride] *= tmpsum; } - }, - ap); + }); } template @@ -72,7 +67,6 @@ void host_softmax_backward( const Tensor& grad, const Tensor& output, int64_t dim) { - internal::init_tbb_num_threads(); int64_t outer_size = 1; int64_t dim_size = grad.size(dim); @@ -87,10 +81,9 @@ void host_softmax_backward( scalar_t* output_data_base = output.data(); scalar_t* gradOutput_data_base = grad.data(); int64_t grain_size = std::min(internal::TBB_GRAIN_SIZE / dim_size, (int64_t)1); - tbb::parallel_for( - tbb::blocked_range(0, outer_size * inner_size, grain_size), - [&](const tbb::blocked_range& r) { - for (int64_t i = r.begin(); i < r.end(); i++) { + parallel_for( + 0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; i++) { int64_t outer_idx = i / inner_size; int64_t inner_idx = i % inner_size; scalar_t* gradInput_data = @@ -118,8 +111,7 @@ void host_softmax_backward( } } } - }, - ap); + }); } } // namespace diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index f9f558a3deed0b..e6fe1ffa2f1253 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -263,18 +263,15 @@ THGenerator* get_generator(at::Generator* gen) { 
Tensor randperm(const Type& dtype, int64_t n, Generator* generator) { Tensor result = dtype.tensor(n); - return at::native::randperm_out(result, n, generator); + return at::randperm_out(result, n, generator); } -Tensor& randperm_out(Tensor& result, int64_t n, Generator* generator) { +Tensor& randperm_out_cpu(Tensor& result, int64_t n, Generator* generator) { if (n < 0) { std::ostringstream oss; oss << "n must be non-negative, got " << n; throw std::runtime_error(oss.str()); } - if (result.type().backend() != at::kCPU) { - throw std::runtime_error("randperm is only implemented for CPU"); - } result.resize_({n}); auto gen = get_generator(generator); diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index 2f1bf85956f85e..ca66f5b547f4a5 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -6,8 +6,14 @@ #include "ATen/Dispatch.h" #include "ATen/Parallel.h" -#include "ATen/optional.h" #include "ATen/cpu/vec256/vec256.h" +#include "ATen/optional.h" + +#ifdef __PPC64__ +using default_partitioner_type = tbb::simple_partitioner; +#else +using default_partitioner_type = tbb::affinity_partitioner; +#endif namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp index 357ec1624b5f95..2e0a68b1fed487 100644 --- a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp +++ b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp @@ -22,8 +22,6 @@ namespace at { namespace native { namespace { -static default_partitioner_type ap; - template inline void _vec_log_softmax_lastdim( scalar_t* input_data_base, @@ -36,15 +34,17 @@ inline void _vec_log_softmax_lastdim( if (grain_size < CHUNK_SIZE) grain_size = CHUNK_SIZE; - tbb::parallel_for( - tbb::blocked_range(0, outer_size, grain_size), - [&](const tbb::blocked_range& r) { - for (int64_t ii = r.begin(); ii < r.end(); ii += CHUNK_SIZE) { + parallel_for( + 0, + outer_size, + grain_size, + [&](int64_t begin, int64_t end) { + for (int64_t ii = begin; ii < end; ii += CHUNK_SIZE) { scalar_t tmp_sum_scalar[CHUNK_SIZE]; scalar_t max_input_arr[CHUNK_SIZE]; int64_t loop_end = CHUNK_SIZE; - if (ii + CHUNK_SIZE > r.end()) - loop_end = r.end() - ii; + if (ii + CHUNK_SIZE > end) + loop_end = end - ii; for (int64_t j = 0; j < loop_end; j++) { int64_t i = ii + j; scalar_t* input_data = input_data_base + i * dim_size; @@ -83,8 +83,7 @@ inline void _vec_log_softmax_lastdim( dim_size); } } - }, - ap); + }); } template @@ -98,10 +97,12 @@ inline void _vec_softmax_lastdim( if (grain_size < 1) grain_size = 1; - tbb::parallel_for( - tbb::blocked_range(0, outer_size, grain_size), - [&](const tbb::blocked_range& r) { - for (int64_t i = r.begin(); i < r.end(); i++) { + parallel_for( + 0, + outer_size, + grain_size, + [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; i++) { scalar_t* input_data = input_data_base + i * dim_size; scalar_t* output_data = output_data_base + i * dim_size; scalar_t max_input = vec256::reduce_all( @@ -122,8 +123,7 @@ inline void _vec_softmax_lastdim( output_data, dim_size); } - }, - ap); + }); } template @@ -138,10 +138,12 @@ inline void _vec_host_softmax_backward_lastdim( if (grain_size < 1) grain_size = 1; - tbb::parallel_for( - tbb::blocked_range(0, outer_size, grain_size), - [&](const tbb::blocked_range& r) { - for (int64_t i = r.begin(); i < r.end(); i++) { + parallel_for( + 0, + outer_size, + grain_size, + [&](int64_t begin, int64_t end) { + for (int64_t i = 
begin; i < end; i++) { scalar_t* grad_input_data = grad_input_data_base + i * dim_size; scalar_t* grad_data = grad_data_base + i * dim_size; scalar_t* output_data = output_data_base + i * dim_size; @@ -173,14 +175,12 @@ inline void _vec_host_softmax_backward_lastdim( dim_size); } } - }, - ap); + }); } template struct vec_host_softmax_lastdim { static void apply(Tensor& output, const Tensor& input) { - internal::init_tbb_num_threads(); int64_t outer_size = 1; int64_t dim_size = input.size(input.ndimension() - 1); for (int64_t i = 0; i < input.ndimension() - 1; ++i) @@ -201,7 +201,6 @@ template struct vec_host_softmax_backward_lastdim { static void apply(Tensor& grad_input, const Tensor& grad, const Tensor& output) { - internal::init_tbb_num_threads(); int64_t outer_size = 1; int64_t dim_size = grad.size(grad.ndimension() - 1); for (int64_t i = 0; i < grad.ndimension() - 1; ++i) diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index efe7dcf87c852b..ab24d7faba0ba6 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -1,7 +1,6 @@ #include "ATen/native/cpu/UnaryOpsKernel.h" #include -#include #include "ATen/Dispatch.h" #include "ATen/Parallel.h" #include "ATen/cpu/vec256/vec256.h" @@ -13,51 +12,51 @@ namespace { using namespace vec256; -template -static void parallel_apply(Tensor& result, const Tensor& self, F f) { - internal::init_tbb_num_threads(); - -static default_partitioner_type ap; - - auto arr_out = result.data(); - auto arr_in = self.data(); - int64_t size = self.numel(); - int64_t grain_size = 2048; - if (size < grain_size) { - map(f, arr_out, arr_in, size); - } else { - tbb::parallel_for( - tbb::blocked_range(0, size, grain_size), - [&](const tbb::blocked_range& r) { - map(f, arr_out + r.begin(), arr_in + r.begin(), r.end() - r.begin()); - }, - ap); - } -} - static void abs_kernel(Tensor& result, const Tensor& self) { AT_DISPATCH_ALL_TYPES(self.type(), "abs", [&] { - parallel_apply( - result, - self, - [](const Vec256& x) { return x.abs(); }); }); + scalar_t* out = result.data(); + scalar_t* in = self.data(); + int64_t size = self.numel(); + parallel_for(0, size, 2048, [out, in](int64_t begin, int64_t end) { + map([](const Vec256& x) { return x.abs(); }, + out + begin, + in + begin, + end - begin); + }); + }); } static void rsqrt_kernel(Tensor& result, const Tensor& self) { AT_DISPATCH_FLOATING_TYPES(self.type(), "rsqrt", [&] { - parallel_apply( - result, - self, - [](const Vec256& x) { return Vec256((scalar_t)(1)) / x.sqrt(); }); }); + scalar_t* out = result.data(); + scalar_t* in = self.data(); + int64_t size = self.numel(); + parallel_for(0, size, 2048, [out, in](int64_t begin, int64_t end) { + map( + [](const Vec256& x) { + return Vec256((scalar_t)(1)) / x.sqrt(); + }, + out + begin, + in + begin, + end - begin); + }); + }); } -#define IMPLEMENT_FLOAT_KERNEL(op) \ - static void op##_kernel(Tensor& result, const Tensor& self) { \ - AT_DISPATCH_FLOATING_TYPES(self.type(), #op, [&] { \ - parallel_apply( \ - result, self, [](const Vec256& x) { return x.op(); }); \ - }); \ - } \ +#define IMPLEMENT_FLOAT_KERNEL(op) \ + static void op##_kernel(Tensor& result, const Tensor& self) { \ + AT_DISPATCH_FLOATING_TYPES(self.type(), #op, [&] { \ + scalar_t* out = result.data(); \ + scalar_t* in = self.data(); \ + int64_t size = self.numel(); \ + parallel_for(0, size, 2048, [out, in](int64_t begin, int64_t end) { \ + map([](const Vec256& x) { return x.op(); }, \ + out + begin, \ + in + 
begin, \ + end - begin); \ + }); \ + }); \ + } \ REGISTER_DISPATCH(op##Impl, &op##_kernel) } // anonymous namespace diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index f6db9bdc681c2b..4f2013c2f19f3e 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -1,4 +1,14 @@ +#include "ATen/ATen.h" #include "ATen/NativeFunctions.h" +#include "ATen/cuda/CUDATypeConversion.cuh" + +#include +#include +#include +#include +#include +#include + #include #include @@ -26,4 +36,57 @@ Tensor& eye_out_cuda(Tensor& result, int64_t n, int64_t m) { return result; } +Tensor& randperm_out_cuda(Tensor& result, int64_t n, Generator* generator) { + if (n < 0) { + std::ostringstream oss; + oss << "n must be non-negative, got " << n; + throw std::runtime_error(oss.str()); + } + + if (n > 0) { + AT_DISPATCH_ALL_TYPES_AND_HALF( + result.type(), "randperm_out_cuda", [&] { + AT_CHECK(Scalar(n).to(), + "n is too large for result tensor type: '", result.type().toString(), "'"); + } + ); + } + + result.resize_({n}); + + if (result.type().scalarType() == at::ScalarType::Half) { + auto result_float = CUDA(kFloat).tensor({n}); + result.copy_(randperm_out_cuda(result_float, n, generator)); + } else { + if (n < 30000) { // For small inputs, we offload it to CPU instead. + auto result_cpu = result.type().toBackend(kCPU).tensor({n}); + randperm_out(result_cpu, n, generator); + result.copy_(result_cpu); + } else { + // Generate random values for the keys array + AT_DISPATCH_ALL_TYPES( + result.type(), "randperm_out_cuda", [&] { + using cuda_scalar_t = cuda::into_type; + + auto keys = result.type().tensor(result.sizes()).random_(generator); + + auto result_data = thrust::device_ptr(result.data()); + auto keys_data = thrust::device_ptr(keys.data()); + + auto state = globalContext().getTHCState(); + THCThrustAllocator thrustAlloc(state); + auto policy = thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)); + + thrust::sequence(policy, result_data, result_data + n); + + // Use the sorted order of keys to rearrange the result array + thrust::sort_by_key(policy, keys_data, keys_data + n, result_data); + } + ); + } + } + + return result; +} + }} // namespace at::native diff --git a/aten/src/ATen/native/cudnn/Conv.cpp b/aten/src/ATen/native/cudnn/Conv.cpp index bba5b9aa92eb1f..c5475980860d82 100644 --- a/aten/src/ATen/native/cudnn/Conv.cpp +++ b/aten/src/ATen/native/cudnn/Conv.cpp @@ -323,7 +323,7 @@ struct ConvolutionArgs { // Hashing machinery for ConvolutionParams struct ParamsHash { - size_t operator()(const ConvolutionParams& params) const { + std::size_t operator()(const ConvolutionParams& params) const { auto ptr = reinterpret_cast(¶ms); uint32_t value = 0x811C9DC5; for (int i = 0; i < (int)sizeof(ConvolutionParams); ++i) { diff --git a/aten/src/ATen/native/miopen/BatchNorm.cpp b/aten/src/ATen/native/miopen/BatchNorm.cpp new file mode 100644 index 00000000000000..a87acd8fb68d76 --- /dev/null +++ b/aten/src/ATen/native/miopen/BatchNorm.cpp @@ -0,0 +1,213 @@ +#include +#include +#include + +#if !AT_MIOPEN_ENABLED + +namespace at { namespace native { + +// See Note [ATen preprocessor philosophy] + +std::tuple cudnn_batch_norm( + const Tensor& input, const Tensor& weight, + const Tensor& bias, const Tensor& running_mean, const Tensor& running_var, + bool training, double exponential_average_factor, double epsilon) { + throw std::runtime_error("cudnn_batch_norm: ATen not compiled with MIOpen support"); +} + 
+std::tuple cudnn_batch_norm_backward( + const Tensor& input, const Tensor& grad_output, const Tensor& weight, + const Tensor& running_mean, const Tensor& running_var, + const Tensor& save_mean, const Tensor& save_var, + double epsilon) { + throw std::runtime_error("cudnn_batch_norm_backward: ATen not compiled with MIOpen support"); +} + +}} // namespace at::native + +#else // AT_MIOPEN_ENABLED + +#include +#include +#include + +#include + +namespace at { namespace native { + +namespace { + +Tensor expandScale(const Tensor& t, int64_t dim) { + std::vector size{ 1, t.numel() }; + while (static_cast(size.size()) < dim) { + size.emplace_back(1); + } + return t.view(size); +} + +} // namespace + +std::tuple cudnn_batch_norm( + const Tensor& input_t, const Tensor& weight_t, + const Tensor& bias_t, const Tensor& running_mean_t, const Tensor& running_var_t, + bool training, double exponential_average_factor, double epsilon) +{ + TensorArg input{ input_t, "input", 1 }, + weight{ weight_t, "weight", 2 }, + bias{ bias_t, "bias", 3 }, + running_mean{ running_mean_t, "running_mean", 4 }, + running_var{ running_var_t, "running_var", 5 }; + CheckedFrom c = "cudnn_batch_norm"; + setMIOpenStreamToCurrent(); + + checkAllDefined(c, {input, weight, bias}); + if (!training) { + checkAllDefined(c, {running_mean, running_var}); + } + checkAllSameGPU(c, {input, weight, bias, running_mean, running_var}); + if (input->type().scalarType() == ScalarType::Half) { + checkScalarType(c, weight, ScalarType::Float); + } else { + checkAllSameType(c, {input, weight}); + } + checkAllSameType(c, {weight, bias, running_mean, running_var}); + // TODO: is weight required to be contiguous? + checkAllContiguous(c, {input, weight, bias, running_mean, running_var}); + checkDimRange(c, input, 2, 6 /* exclusive */); + auto num_features = input->size(1); + for (auto t : {weight, bias, running_mean, running_var}) { + if (t->defined()) { + checkNumel(c, t, num_features); + } + } + + miopenBatchNormMode_t mode; + if (input->dim() == 2) { + mode = miopenBNPerActivation; + } else { + mode = miopenBNSpatial; + } + + auto output_t = input->type().tensor(input->sizes()); + TensorArg output{ output_t, "output", 0 }; + + auto handle = getMiopenHandle(); + auto dataType = getMiopenDataType(*input); + TensorDescriptor idesc{ *input, 4 }; // input descriptor + TensorDescriptor wdesc{ expandScale(*weight, input->dim()), 4 }; // descriptor for weight, bias, running_mean, etc. 
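// Editorial note, not part of the original patch: expandScale() above views the
// 1-d scale/bias tensor of length C as {1, C, 1, ...} and, combined with the
// pad argument of 4, gives wdesc the same dimensionality as the 4-d input
// descriptor idesc. That 1xCx1x1 layout is the derived batch-norm descriptor
// shape fed to the miopenBatchNormalization* calls below.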
+ + Constant one(dataType, 1); + Constant zero(dataType, 0); + Tensor save_mean, save_var; + + if (training) { + int64_t num_features = input_t.size(1); + save_mean = weight_t.type().tensor({ num_features }); + save_var = weight_t.type().tensor({ num_features }); + MIOPEN_CHECK(miopenBatchNormalizationForwardTraining( + handle, mode, &one, &zero, + idesc.desc(), input->data_ptr(), + idesc.desc(), output->data_ptr(), + wdesc.desc(), + weight->data_ptr(), + bias->data_ptr(), + exponential_average_factor, + at::maybe_data_ptr(running_mean), + at::maybe_data_ptr(running_var), + epsilon, + save_mean.data_ptr(), + save_var.data_ptr())); + } else { + MIOPEN_CHECK(miopenBatchNormalizationForwardInference( + handle, mode, &one, &zero, + idesc.desc(), input->data_ptr(), + idesc.desc(), output->data_ptr(), + wdesc.desc(), + weight->data_ptr(), + bias->data_ptr(), + running_mean->data_ptr(), + running_var->data_ptr(), + epsilon)); + } + + // save_mean and save_var can be undefined + // If this causes problems, we can initialize them to empty tensors + // of the correct type + return std::tuple{output_t, save_mean, save_var}; +} + +// NB: CuDNN only implements the backward algorithm for batchnorm +// in training mode (evaluation mode batchnorm has a different algorithm), +// which is why this doesn't accept a 'training' parameter. +std::tuple cudnn_batch_norm_backward( + const Tensor& input_t, const Tensor& grad_output_t, const Tensor& weight_t, + // Unused: but we require them to be passed so that double backwards + // has access + const Tensor& running_mean, const Tensor& running_var, + const Tensor& save_mean_t, const Tensor& save_var_t, + double epsilon) +{ + TensorArg input{ input_t, "input", 1 }, + grad_output{ grad_output_t, "grad_output", 2 }, + weight{ weight_t, "weight", 3 }, + save_mean{ save_mean_t, "save_mean", 4 }, + save_var{ save_var_t, "save_var", 5 }; + CheckedFrom c = "cudnn_batch_norm_backward"; + setMIOpenStreamToCurrent(); + + checkAllDefined(c, {input, grad_output, weight, save_mean, save_var}); + checkAllSameGPU(c, {input, grad_output, weight, save_mean, save_var}); + if (input->type().scalarType() == ScalarType::Half) { + checkScalarType(c, weight, ScalarType::Float); + } else { + checkAllSameType(c, {input, weight}); + } + checkAllSameType(c, {input, grad_output}); + checkAllSameType(c, {weight, save_mean, save_var}); + // TODO: is weight required to be contiguous? + checkAllContiguous(c, {input, grad_output, save_mean, save_var}); + checkDimRange(c, input, 2, 6 /* exclusive */); + checkSameSize(c, input, grad_output); + auto num_features = input->size(1); + for (auto t : {weight, save_mean, save_var}) { + checkNumel(c, t, num_features); + } + + miopenBatchNormMode_t mode; + if (input->dim() == 2) { + mode = miopenBNPerActivation; + } else { + mode = miopenBNSpatial; + } + + auto grad_input_t = input->type().tensor(input->sizes()); + auto grad_weight_t = weight->type().tensor(weight->sizes()); + auto grad_bias_t = weight->type().tensor(weight->sizes()); + + auto handle = getMiopenHandle(); + auto dataType = getMiopenDataType(*input); + + TensorDescriptor idesc{ *input, 4 }; // input, output, grad_output descriptor + TensorDescriptor wdesc{ expandScale(*weight, input->dim()), 4 }; // descriptor for weight, bias, save_mean, etc. 
+ + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenBatchNormalizationBackward( + handle, mode, &one, &zero, &one, &zero, + idesc.desc(), input->data_ptr(), + idesc.desc(), grad_output->data_ptr(), + idesc.desc(), grad_input_t.data_ptr(), + wdesc.desc(), weight->data_ptr(), + grad_weight_t.data_ptr(), + grad_bias_t.data_ptr(), + epsilon, + save_mean->data_ptr(), + save_var->data_ptr())); + + return std::tuple{grad_input_t, grad_weight_t, grad_bias_t}; +} + +}} // namespace native + +#endif diff --git a/aten/src/ATen/native/miopen/Conv.cpp b/aten/src/ATen/native/miopen/Conv.cpp new file mode 100644 index 00000000000000..fb9bd89a2919cc --- /dev/null +++ b/aten/src/ATen/native/miopen/Conv.cpp @@ -0,0 +1,1209 @@ +#include +#include +#include + +#if !AT_MIOPEN_ENABLED + +namespace at { namespace native { + +// See Note [ATen preprocessor philosophy] + +at::Tensor cudnn_convolution( + const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution: ATen not compiled with MIOpen support"); +} + +at::Tensor cudnn_convolution_backward_input( + IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_backward_input: ATen not compiled with MIOpen support"); +} + +at::Tensor cudnn_convolution_backward_weight( + IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_backward_weight: ATen not compiled with MIOpen support"); +} + +at::Tensor cudnn_convolution_backward_bias( + const at::Tensor& grad_output) { + throw std::runtime_error("cudnn_convolution_backward_bias: ATen not compiled with MIOpen support"); +} + +std::tuple cudnn_convolution_backward( + const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic, std::array output_mask) { + throw std::runtime_error("cudnn_convolution_backward: ATen not compiled with MIOpen support"); +} + +at::Tensor cudnn_convolution_transpose( + const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, + IntList padding, IntList output_padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_transpose: ATen not compiled with MIOpen support"); +} + +at::Tensor cudnn_convolution_transpose_backward_input( + const at::Tensor& grad_output, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_transpose_backward: ATen not compiled with MIOpen support"); +} + +at::Tensor cudnn_convolution_transpose_backward_weight( + IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + throw std::runtime_error("cudnn_convolution_transpose_backward_weight: ATen not compiled with MIOpen support"); +} + +std::tuple 
cudnn_convolution_transpose_backward( + const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic, std::array output_mask) { + throw std::runtime_error("cudnn_convolution_transpose_backward: ATen not compiled with MIOpen support"); +} + +}} + +#else // AT_MIOPEN_ENABLED + +#include "THC/THC.h" + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { namespace native { + +// TODO: Go through all the checking code again and make sure +// we haven't missed anything. + +// --------------------------------------------------------------------- +// +// Math +// +// --------------------------------------------------------------------- + +constexpr int input_batch_size_dim = 0; // also grad_input +constexpr int input_channels_dim = 1; +constexpr int output_batch_size_dim = 0; // also grad_output +constexpr int output_channels_dim = 1; +constexpr int weight_output_channels_dim = 0; +constexpr int weight_input_channels_dim = 1; + +// Often written as 2 + max_dim (extra dims for batch size and channels) +constexpr int max_dim = 3; + +// NB: conv_output_size and conv_input_size are not bijections, +// as conv_output_size loses information; this is why conv_input_size +// takes an extra output_padding argument to resolve the ambiguity. + +std::vector conv_output_size( + IntList input_size, IntList weight_size, + IntList padding, IntList stride, IntList dilation, int64_t groups +) { + // ASSERT(input_size.size() > 2) + // ASSERT(input_size.size() == weight_size.size()) + auto dim = input_size.size(); + std::vector output_size(dim); + output_size[0] = input_size[input_batch_size_dim]; + output_size[1] = weight_size[weight_output_channels_dim]; + for (size_t d = 2; d < dim; ++d) { + auto kernel = dilation[d - 2] * (weight_size[d] - 1) + 1; + output_size[d] = (input_size[d] + (2 * padding[d - 2]) + - kernel) / stride[d - 2] + 1; + } + return output_size; +} + +std::vector conv_input_size( + IntList output_size, IntList weight_size, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups +) { + // ASSERT(output_size.size() > 2) + // ASSERT(output_size.size() == weight_size.size()) + auto dim = output_size.size(); + std::vector input_size(dim); + input_size[0] = output_size[output_batch_size_dim]; + input_size[1] = weight_size[weight_input_channels_dim] * groups; + for (size_t d = 2; d < dim; ++d) { + int kernel = dilation[d - 2] * (weight_size[d] - 1) + 1; + input_size[d] = (output_size[d] - 1) * stride[d - 2] - (2 * padding[d - 2]) + + kernel + output_padding[d - 2]; + } + return input_size; +} + +std::vector conv_weight_size( + IntList input_size, IntList output_size, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups +) { + auto dim = input_size.size(); + std::vector weight_size(dim); + weight_size[0] = output_size[1]; + weight_size[1] = input_size[1] / groups; + for (size_t d = 2; d < dim; ++d) { + int kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2] + + 2 * padding[d - 2] - output_padding[d - 2]; + weight_size[d] = (kernel - 1) / dilation[d - 2] + 1; + } + return weight_size; +} + +// TODO: Move this into the standard library, with a better name? 
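A quick worked instance of conv_output_size() above (editorial, with made-up sizes): for a spatial dimension with input_size = 32, weight_size = 3, padding = 1, stride = 2 and dilation = 1, the effective kernel is 1 * (3 - 1) + 1 = 3, so output_size = (32 + 2*1 - 3) / 2 + 1 = 16. The integer division is why conv_input_size() cannot invert this exactly and takes the extra output_padding argument, as noted above.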
+Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { + auto group_size = t.size(dim) / groups; + return t.narrow(dim, group_idx * group_size, group_size); +} + +// --------------------------------------------------------------------- +// +// Checking +// +// --------------------------------------------------------------------- + +// Note [Legacy CuDNN grouped convolution support] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// CuDNN earlier than CuDNN 7 does not directly support group +// convolution, so we provide support for it by sequentially +// running a convolution per group with appropriately +// adjusted sizes. https://blog.yani.io/filter-group-tutorial/ +// has a fairly good diagram explaining how it works. + +// Used on pad, stride and dilation +static void check_args(CheckedFrom c, IntList args, size_t expected_size, const char* arg_name) +{ + if (args.size() > expected_size){ + std::stringstream ss; + ss << "Too many " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; + throw std::runtime_error(ss.str()); + } + else if (args.size() < expected_size){ + std::stringstream ss; + ss << "Not enough " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; + throw std::runtime_error(ss.str()); + } + + auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); + if (num_negative_values > 0){ + std::stringstream ss; + ss << arg_name << " should be greater than zero but got ("; + std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); + ss << args.back() << ")" << " (while checking arguments for " << c << ")"; + throw std::runtime_error(ss.str()); + } +} + + +// NB: For many call sites, it is not strictly necessary to check all of +// these relationships (for example, for forward convolution, we compute +// the size of output ourselves, so we don't actually need to check +// output. However, writing a single function that does everything +// means we get to reuse it for both forwards and all backwards +// variants, even when the set of "real" inputs varies. The magic of +// relational computing! +// +// (There is one downside, which is that it is slightly harder to write +// error messages which are able to distinguish between real inputs +// (which the user can change) and computed inputs (which the user can +// only indirectly affect). It would be an interesting exercise to +// come up with a general framework to handle such situations.) 
+static void convolution_shape_check( + CheckedFrom c, + const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output, + IntList padding, IntList stride, IntList dilation, int64_t groups) +{ + check_args(c, padding, input->dim() - 2, "padding"); + check_args(c, stride, padding.size(), "stride"); + check_args(c, dilation, padding.size(), "dilation"); + + // Input + checkDimRange(c, input, 3, 6 /* exclusive */); + checkSize(c, input, input_channels_dim, weight->size(1) * groups); + + // Weight + checkSameDim(c, input, weight); + + // TODO: check that output->size() matches output_sizes + // TODO: check that weight matches output->sizes() + checkSameDim(c, input, output); +} + +// This POD struct is used to let us easily compute hashes of the +// parameters +struct ConvolutionParams +{ + miopenDataType_t dataType; + int input_size[2 + max_dim]; + int input_stride[2 + max_dim]; + int weight_size[2 + max_dim]; + int padding[max_dim]; + int stride[max_dim]; + int dilation[max_dim]; + int64_t groups; + bool deterministic; + // NB: transposed purposely omitted: transposed just swaps + // forward and backward, so you can reuse the benchmark entry, +}; +// ConvolutionParams must be a POD because we read out its memory +// contenst as char* when hashing +static_assert(std::is_pod::value, "ConvolutionParams not POD"); + +// NB: This can't be a constructor, because then ConvolutionParams +// would not be a POD anymore. +// TODO: Use TensorGeometry here instead of the entire Tensor, which we +// don't actually need. (OTOH: We can always pass in +// grad_input/grad_output, so this is not very pressing) +void setConvolutionParams( + ConvolutionParams* params, + const at::Tensor& input, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool deterministic) { + + miopenDataType_t dataType = getMiopenDataType(input); + memset(params, 0, sizeof(ConvolutionParams)); + params->dataType = dataType; + // ASSERT(weight.dim() == input.dim()) + for (int i = 0; i != input.dim(); ++i) { + params->input_size[i] = (int) input.size(i); + params->input_stride[i] = (int) input.stride(i); + params->weight_size[i] = (int) weight.size(i); + } + // ASSERT(padding.size() == stride.size()) + // ASSERT(padding.size() == dilation.size()) + for (size_t i = 0; i != padding.size(); ++i) { + params->padding[i] = padding[i]; + params->stride[i] = stride[i]; + params->dilation[i] = dilation[i]; + } + // In principle, we shouldn't parametrize by groups for legacy + // CuDNN, but it doesn't seem worth the effort to actually do this. 
+ params->groups = groups; + params->deterministic = deterministic; +} + +// Convenience struct for passing around descriptors and data +// pointers +struct ConvolutionArgs { + miopenHandle_t handle; + ConvolutionParams params; + TensorDescriptor idesc, odesc; + FilterDescriptor wdesc; + const Tensor& input, output, weight; + ConvolutionDescriptor cdesc; + + ConvolutionArgs(const Tensor& input, const Tensor& output, const Tensor& weight) : input(input), output(output), weight(weight) { + } +}; + +// --------------------------------------------------------------------- +// +// Benchmarking +// +// --------------------------------------------------------------------- + +// Hashing machinery for ConvolutionParams +struct ParamsHash { + std::size_t operator()(const ConvolutionParams& params) const { + auto ptr = reinterpret_cast(¶ms); + uint32_t value = 0x811C9DC5; + for (int i = 0; i < (int)sizeof(ConvolutionParams); ++i) { + value ^= ptr[i]; + value *= 0x01000193; + } + return (size_t)value; + } +}; + +struct ParamsEqual { + bool operator()(const ConvolutionParams& a, const ConvolutionParams& b) const { + auto ptr1 = reinterpret_cast(&a); + auto ptr2 = reinterpret_cast(&b); + return memcmp(ptr1, ptr2, sizeof(ConvolutionParams)) == 0; + } +}; + +// TODO: Use something less heavy duty than a big honking mutex +template +struct BenchmarkCache { + std::mutex mutex; + std::unordered_map map; + + bool find(const ConvolutionParams& params, T* results) { + std::lock_guard guard(mutex); + auto it = map.find(params); + if (it == map.end()) { + return false; + } + *results = it->second; + return true; + } + + void insert(const ConvolutionParams& params, const T& results) { + std::lock_guard guard(mutex); + map[params] = results; + } +}; + +BenchmarkCache fwd_algos; +BenchmarkCache bwd_data_algos; +BenchmarkCache bwd_filter_algos; + +// TODO: Stop manually allocating CUDA memory; allocate an ATen byte +// tensor instead. 
+struct Workspace { + Workspace(size_t size) : size(size), data(NULL) { + CUDA_CHECK(THCudaMalloc(globalContext().lazyInitCUDA(), &data, size)); + } + Workspace(const Workspace&) = delete; + Workspace(Workspace&&) = default; + Workspace& operator=(Workspace&&) = default; + ~Workspace() { + if (data) { + THCudaFree(globalContext().lazyInitCUDA(), data); + } + } + + size_t size; + void* data; +}; + +template +struct algorithm_search { +}; + +miopenStatus_t getWorkspaceSize( + const ConvolutionArgs& args, + miopenConvFwdAlgorithm_t algo, size_t* sz) +{ + return miopenConvolutionForwardGetWorkSpaceSize( + args.handle, + args.wdesc.desc(), + args.idesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + sz); +} +miopenStatus_t getWorkspaceSize( + const ConvolutionArgs& args, miopenConvBwdDataAlgorithm_t algo, size_t* sz) +{ + return miopenConvolutionBackwardDataGetWorkSpaceSize( + args.handle, + args.odesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.idesc.desc(), + sz); +} +miopenStatus_t getWorkspaceSize( + const ConvolutionArgs& args, miopenConvBwdWeightsAlgorithm_t algo, size_t* sz) +{ + return miopenConvolutionBackwardWeightsGetWorkSpaceSize( + args.handle, + args.odesc.desc(), + args.idesc.desc(), + args.cdesc.desc(), + args.wdesc.desc(), + sz); +} + +template +size_t getMaxWorkspaceSize( + const ConvolutionArgs& args, + const algo_t *algo, int n_algo) +{ + THCState *state = globalContext().lazyInitCUDA(); + + size_t max_ws_size = 0; + size_t max_block_size = 0; + size_t total_gpu_mem = 0; + size_t free_gpu_mem = 0; + + THCudaCheck(THCudaMemGetInfoCached(state, &free_gpu_mem, &total_gpu_mem, &max_block_size)); + + for (int i = 0; i < n_algo; i++) { + miopenStatus_t err; + size_t sz; + err = getWorkspaceSize(args, algo[i], &sz); + if (miopenStatusSuccess != err || sz == 0 + || sz < max_ws_size || sz > max_block_size) continue; + max_ws_size = sz; + } + return max_ws_size; +} + +template +perf_t getBestAlgorithm(perf_t *perfResults, bool deterministic, int n_algo) { + if (deterministic) { + throw std::runtime_error("no deterministic convolution algorithms available in MIOpen"); + } else { + return perfResults[0]; + } +} + +template<> +struct algorithm_search { + using perf_t = miopenConvAlgoPerf_t; + using algo_t = miopenConvFwdAlgorithm_t; + + static constexpr auto DEFAULT_ALGO = miopenConvolutionFwdAlgoGEMM; + static BenchmarkCache& cache() { return fwd_algos; } + + static perf_t findAlgorithm(const ConvolutionArgs& args) { + static const algo_t algos[] = { + miopenConvolutionFwdAlgoGEMM, + miopenConvolutionFwdAlgoFFT, + miopenConvolutionFwdAlgoDirect, + miopenConvolutionFwdAlgoWinograd, + }; + static constexpr int num_algos = 4; + static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing MIOpen convolution forward algorithms"); + int perf_count; + std::unique_ptr perf_results(new perf_t[num_algos]); + size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); + Workspace ws(max_ws_size); + MIOPEN_CHECK(miopenFindConvolutionForwardAlgorithm( + args.handle, + args.idesc.desc(), args.input.data_ptr(), + args.wdesc.desc(), args.weight.data_ptr(), + args.cdesc.desc(), + args.odesc.desc(), args.output.data_ptr(), + num_algos, + &perf_count, + &perf_results, + ws.data, + ws.size, + false)); + return getBestAlgorithm(perf_results.get(), args.params.deterministic, perf_count) + } + + static void getAlgorithm( + const ConvolutionArgs& args, + algo_t* algo) + { + throw std::runtime_error("miopenGetConvolutionForwardAlgorithm is not supported."); + 
/*miopenConvolutionFwdPreference_t pref = cudnn_convolution_FWD_PREFER_FASTEST; + MIOPEN_CHECK(miopenGetConvolutionForwardAlgorithm( + args.handle, + args.idesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + pref, + 0, + algo));*/ + } + + static void getWorkspaceSize( + const ConvolutionArgs& args, + algo_t algo, size_t* workspaceSize) + { + throw std::runtime_error("miopenGetConvolutionForwardWorkspaceSize is not supported."); + /*MIOPEN_CHECK(miopenGetConvolutionForwardWorkspaceSize( + args.handle, + args.idesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + algo, + workspaceSize));*/ + } +}; + +template<> +struct algorithm_search { + using perf_t = miopenConvAlgoPerf_t; + using algo_t = miopenConvBwdDataAlgorithm_t; + + static constexpr auto DEFAULT_ALGO = miopenConvolutionBwdDataAlgoGEMM; + static BenchmarkCache& cache() { return bwd_data_algos; } + + static perf_t findAlgorithm(const ConvolutionArgs& args) { + static const algo_t algos[] = { + miopenConvolutionBwdDataAlgoGEMM, + miopenConvolutionBwdDataAlgoDirect, + miopenConvolutionBwdDataAlgoFFT, + miopenConvolutionBwdDataAlgoWinograd, + }; + static constexpr int num_algos = 4; + static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing MIOpen convolution backward data algorithms."); + int perf_count; + std::unique_ptr perf_results(new perf_t[num_algos]); + size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); + Workspace ws(max_ws_size); + MIOPEN_CHECK(miopenFindConvolutionBackwardDataAlgorithm( + args.handle, + args.odesc.desc(), args.output.data_ptr(), + args.wdesc.desc(), args.weight.data_ptr(), + args.cdesc.desc(), + args.idesc.desc(), args.input.data_ptr(), + num_algos, + &perf_count, + &perf_results, + ws.data, + ws.size, + false)); + return getBestAlgorithm(perf_results.get(), args.params.deterministic, perf_count); + } + + static void getAlgorithm(const ConvolutionArgs& args, algo_t* algo) { + throw std::runtime_error("miopenGetConvolutionBackwardDataAlgorithm is not supported."); + + /*MIOPEN_CHECK(miopenGetConvolutionBackwardDataAlgorithm( + args.handle, + args.wdesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.idesc.desc(), + cudnn_convolution_BWD_DATA_PREFER_FASTEST, + 0, + algo));*/ + } + + static void getWorkspaceSize( + const ConvolutionArgs& args, + miopenConvBwdDataAlgorithm_t algo, size_t* workspaceSize) + { + throw std::runtime_error("miopenGetConvolutionBackwardDataWorkspaceSize is not supported."); + + /*MIOPEN_CHECK(miopenGetConvolutionBackwardDataWorkspaceSize( + args.handle, + args.wdesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.idesc.desc(), + algo, + workspaceSize));*/ + } +}; + +template<> +struct algorithm_search { + using perf_t = miopenConvAlgoPerf_t; + using algo_t = miopenConvBwdWeightsAlgorithm_t; + + static constexpr auto DEFAULT_ALGO = miopenConvolutionBwdWeightsAlgoGEMM; + static BenchmarkCache& cache() { return bwd_filter_algos; } + + static perf_t findAlgorithm(const ConvolutionArgs& args) { + static const algo_t algos[] = { + miopenConvolutionBwdWeightsAlgoGEMM, + miopenConvolutionBwdWeightsAlgoDirect, + }; + + static constexpr int num_algos = 2; + static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing MIOpen convolution backward filter algorithms."); + std::unique_ptr perf_results(new perf_t[num_algos]); + size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); + int perf_count; + Workspace ws(max_ws_size); + + 
MIOPEN_CHECK(miopenFindConvolutionBackwardWeightsAlgorithm( + args.handle, + args.odesc.desc(), args.output.data_ptr(), + args.idesc.desc(), args.input.data_ptr(), + args.cdesc.desc(), + args.wdesc.desc(), args.weight.data_ptr(), + num_algos, + &perf_count, + &perf_results, + ws.data, + ws.size, + false)); + return getBestAlgorithm(perf_results.get(), args.params.deterministic, perf_count); + } + + static void getAlgorithm(const ConvolutionArgs& args, algo_t* algo) { + throw std::runtime_error("miopenGetConvolutionBackwardFilterAlgorithm is not supported."); + + /*MIOPEN_CHECK(miopenGetConvolutionBackwardFilterAlgorithm( + args.handle, + args.idesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.wdesc.desc(), + cudnn_convolution_BWD_FILTER_PREFER_FASTEST, + 0, + algo));*/ + } + + static void getWorkspaceSize(const ConvolutionArgs& args, algo_t algo, size_t* workspaceSize) + { + throw std::runtime_error("miopenGetConvolutionBackwardFilterWorkspaceSize is not supported."); + + /*MIOPEN_CHECK(miopenGetConvolutionBackwardFilterWorkspaceSize( + args.handle, + args.idesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.wdesc.desc(), + algo, + workspaceSize));*/ + } +}; + +template +void findAlgorithm(const ConvolutionArgs& args, bool benchmark, algo_t* algo) { + using search = algorithm_search; + auto& cache = search::cache(); + + if (cache.find(args.params, algo)) { + return; + } + + if (args.params.deterministic && !benchmark) { + *algo = search::DEFAULT_ALGO; + return; + } + + if (!benchmark) { + search::getAlgorithm(args, algo); + return; + } + + if (cache.find(args.params, algo)) { + // re-check cache since another thread may have benchmarked the algorithm + return; + } + + auto perfResults = search::findAlgorithm(args); + // for deterministic algo, look at all the perf results and return the best + // deterministic algo + if (perfResults.status == miopenStatusSuccess && + !(args.params.deterministic /*&& perfResults.determinism != CUDNN_DETERMINISTIC*/)) { + *algo = perfResults.algo; + } else { + //*algo = search::DEFAULT_ALGO; + throw std::runtime_error("Deterministic algorithms aren't supported in MIOpen."); + } + cache.insert(args.params, *algo); + + THCDeviceAllocator* allocator = THCCachingAllocator_get(); + CUDA_CHECK(allocator->emptyCache(allocator->state)); +} + +template +Workspace chooseAlgorithm( + const ConvolutionArgs& args, + bool benchmark, + algo_t* algo) +{ + findAlgorithm(args, benchmark, algo); + + using search = algorithm_search; + size_t workspace_size; + search::getWorkspaceSize(args, *algo, &workspace_size); + try { + return Workspace(workspace_size); + } catch (std::runtime_error& e) { + hipGetLastError(); // clear OOM error + + // switch to default algorithm and record it in the cache to prevent + // further OOM errors + *algo = search::DEFAULT_ALGO; + search::cache().insert(args.params, *algo); + + search::getWorkspaceSize(args, *algo, &workspace_size); + return Workspace(workspace_size); + } +} + +// --------------------------------------------------------------------- +// +// Bias addition +// +// --------------------------------------------------------------------- + +// In-place! +void cudnn_convolution_add_bias_(CheckedFrom c, const TensorArg& output, const TensorArg& bias) +{ + checkAllSameType(c, {output, bias}); + checkAllSameGPU(c, {output, bias}); + checkSize(c, bias, { output->size(output_channels_dim) }); + + // See Note [CuDNN broadcast padding]. 
Handle the left padding + // ourselves, but use TensorDescriptor's padding argument to do the rest. + TensorDescriptor bdesc, odesc; + bdesc.set(bias->expand({1, bias->size(0)}), output->dim()); + odesc.set(*output); + + auto handle = getMiopenHandle(); + auto dataType = getMiopenDataType(*bias); + Constant one(dataType, 1); + + MIOPEN_CHECK(miopenConvolutionForwardBias(handle, &one, bdesc.desc(), bias->data_ptr(), + &one, odesc.desc(), output->data_ptr())); +} + +// The general strategy: +// +// - cudnn_convolution (Tensor) +// Entry points for clients, takes bias +// +// - cudnn_convolution_forward (TensorArg) +// Entry point, which may be reused between regular +// convolution and transposed convolution. Does NOT take bias. +// +// - raw_cudnn_convolution_forward_out (Tensor) +// Low level function which invokes CuDNN, and takes an output +// tensor which is directly written to (thus _out). +// +// Where does argument checking happen? Here's the division of +// responsibility: +// - Things that happen in at::Tensor +// - TensorArg allocation +// - setCuDNNStreamToCurrent +// - Things that happen in TensorArg +// - Check arguments (type, GPU, shape) +// +// TODO: Consider renaming zero-indexed arguments to "self" + + + +// --------------------------------------------------------------------- +// +// Convolution forward / Transposed convolution backward +// +// --------------------------------------------------------------------- + +// The raw API directly invokes CuDNN and does not emulate support +// for group convolution on old versions of CuDNN. +// +// There are a few reasons this should never be directly exposed +// via ATen: +// +// - It takes output as a parameter (this should be computed!) +// - It doesn't do input checking +// - It doesn't resize output (it is assumed to be correctly sized) +// - It takes a ConvolutionParams struct +// +void raw_cudnn_convolution_forward_out( + const Tensor& output, const Tensor& input, const Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + + auto dataType = getMiopenDataType(input); + + ConvolutionArgs args{ input, output, weight }; + args.handle = getMiopenHandle(); + setConvolutionParams(&args.params, input, weight, padding, stride, dilation, groups, deterministic); + args.idesc.set(input); + args.wdesc.set(weight); + args.odesc.set(output); + args.cdesc.set(dataType, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); + + // TODO: when we do legacy group convolution support, we'll repeatedly + // reinitialize the workspace for each convolution we do. This is + // wasteful; we'd rather reuse the workspace. OTOH, legacy group + // convolution support is already pretty slow, so this might not + // matter. (This applies to raw_cudnn_convolution_backward_input as well.) 
+ miopenConvFwdAlgorithm_t fwdAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &fwdAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenConvolutionForward( + args.handle, + &one, args.idesc.desc(), input.data_ptr(), + args.wdesc.desc(), weight.data_ptr(), + args.cdesc.desc(), fwdAlg, &zero, + args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size)); +} + +Tensor cudnn_convolution_forward( + CheckedFrom c, + const TensorArg& input, const TensorArg& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + checkAllSameType(c, {input, weight}); + checkAllSameGPU(c, {input, weight}); + + auto output_t = input->type().tensor( + conv_output_size(input->sizes(), weight->sizes(), + padding, stride, dilation, groups)); + + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{ output_t, "result", 0 }; + convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups); + + // See #4500 + Tensor weight_contig = weight->contiguous(); + +#if CUDNN_VERSION < 7000 + for (int i = 0; i < groups; i++) { + raw_cudnn_convolution_forward_out( + narrowGroup(*output, output_channels_dim, i, groups), + narrowGroup(*input, input_channels_dim, i, groups), + narrowGroup(weight_contig, weight_output_channels_dim, i, groups), + padding, stride, dilation, 1, benchmark, deterministic); + } +#else + raw_cudnn_convolution_forward_out( + *output, *input, weight_contig, + padding, stride, dilation, groups, benchmark, deterministic); +#endif + + return *output; +} + +Tensor cudnn_convolution( + const Tensor& input_t, const Tensor& weight_t, const Tensor& bias_t, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) +{ + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }, + bias { bias_t, "bias", 3 }; + setMIOpenStreamToCurrent(); + CheckedFrom c = "cudnn_convolution"; + auto output_t = cudnn_convolution_forward( + c, input, weight, padding, stride, dilation, groups, benchmark, deterministic); + if (bias->defined()) { + cudnn_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + } + return output_t; +} + +// NB: output_padding not needed here, as there is no ambiguity to +// resolve +Tensor cudnn_convolution_transpose_backward_input( + const Tensor& grad_output_t, const Tensor& weight_t, + IntList padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) +{ + TensorArg grad_output { grad_output_t, "grad_output", 1 }, + weight { weight_t, "weight", 2 }; + setMIOpenStreamToCurrent(); + return cudnn_convolution_forward( + "cudnn_convolution_transpose_backward_input", + grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); +} + +std::tuple cudnn_convolution_transpose_backward( + const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic, std::array output_mask) { + + Tensor grad_output = grad_output_t.contiguous(); + + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = at::cudnn_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); + } + if (output_mask[1]) { + grad_weight = at::cudnn_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, 
padding, stride, dilation, groups, benchmark, deterministic); + } + if (output_mask[2]) { + grad_bias = at::cudnn_convolution_backward_bias(grad_output); + } + + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +// --------------------------------------------------------------------- +// +// Convolution backward / Transposed convolution forward +// +// --------------------------------------------------------------------- + +void raw_cudnn_convolution_backward_input_out( + const at::Tensor& grad_input, + const at::Tensor& grad_output, + const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + + auto dataType = getMiopenDataType(grad_output); + + ConvolutionArgs args{ grad_input, grad_output, weight }; + args.handle = getMiopenHandle(); + setConvolutionParams(&args.params, grad_input, weight, padding, stride, dilation, groups, deterministic); + args.idesc.set(grad_input); + args.wdesc.set(weight); + args.odesc.set(grad_output); + args.cdesc.set(dataType, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); + + miopenConvBwdDataAlgorithm_t bwdDataAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &bwdDataAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenConvolutionBackwardData( + args.handle, + &one, args.odesc.desc(), grad_output.data_ptr(), + args.wdesc.desc(), weight.data_ptr(), + args.cdesc.desc(), bwdDataAlg, &zero, + args.idesc.desc(), grad_input.data_ptr(), workspace.data, workspace.size)); +} + +// Backward and transpose are algorithmically equivalent, but they +// compute their geometry differently. In a backwards, you knew what +// the original size of the input tensor was, so you can cache that +// geometry and fill it directly. In transposed convolution, it is +// more conventional to not explicitly specify the output (previously +// input) size, and compute it. This, however, leaves a degree of +// freedom; this degree of freedom is resolved using the +// output_padding parameter. Both of these interfaces are equivalent, +// but they are differently convenient depending on the use case. 
+ +Tensor cudnn_convolution_backward_input( + CheckedFrom c, + IntList input_size, const TensorArg& grad_output, const TensorArg& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + checkAllSameType(c, {grad_output, weight}); + checkAllSameGPU(c, {grad_output, weight}); + + auto grad_input_t = grad_output->type().tensor(input_size); + + // Avoid "grad_input" when this is being used as transposed convolution + TensorArg grad_input{ grad_input_t, "result", 0 }; + convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); + + // See #4500 + Tensor weight_contig = weight->contiguous(); + +#if CUDNN_VERSION < 7000 + for (int i = 0; i < groups; i++) { + raw_cudnn_convolution_backward_input_out( + narrowGroup(*grad_input, input_channels_dim, i, groups), + narrowGroup(*grad_output, output_channels_dim, i, groups), + narrowGroup(weight_contig, weight_output_channels_dim, i, groups), + padding, stride, dilation, 1, benchmark, deterministic); + } +#else + raw_cudnn_convolution_backward_input_out( + *grad_input, *grad_output, weight_contig, + padding, stride, dilation, groups, benchmark, deterministic); +#endif + + return *grad_input; +} + +Tensor cudnn_convolution_transpose_forward( + CheckedFrom c, + const TensorArg& grad_output, const TensorArg& weight, + IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + auto input_size = conv_input_size(grad_output->sizes(), weight->sizes(), + padding, output_padding, stride, dilation, groups); + return cudnn_convolution_backward_input(c, input_size, grad_output, weight, + padding, stride, dilation, groups, benchmark, deterministic); +} + +Tensor cudnn_convolution_backward_input( + IntList input_size, const Tensor& grad_output_t, const Tensor& weight_t, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + weight{ weight_t, "weight", 2 }; + setMIOpenStreamToCurrent(); + return cudnn_convolution_backward_input( + "cudnn_convolution_backward_input", + input_size, grad_output, weight, + padding, stride, dilation, groups, benchmark, deterministic); +} + +std::tuple cudnn_convolution_backward( + const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic, std::array output_mask) { + + Tensor grad_output = grad_output_t.contiguous(); + + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = at::cudnn_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); + } + if (output_mask[1]) { + grad_weight = at::cudnn_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); + } + if (output_mask[2]) { + grad_bias = at::cudnn_convolution_backward_bias(grad_output); + } + + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +Tensor cudnn_convolution_transpose( + const Tensor& input_t, const Tensor& weight_t, const Tensor& bias_t, + IntList padding, IntList output_padding, IntList stride, IntList dilation, + int64_t groups, bool benchmark, bool deterministic) +{ + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }, + bias { bias_t, "bias", 3 }; + CheckedFrom 
c = "cudnn_convolution_transpose"; + auto output_t = cudnn_convolution_transpose_forward( + c, input, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic); + if (bias->defined()) { + cudnn_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + } + return output_t; +} + +// --------------------------------------------------------------------- +// +// Convolution backward (weight) +// +// --------------------------------------------------------------------- + +void raw_cudnn_convolution_backward_weight_out( + const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) { + + auto dataType = getMiopenDataType(input); + + ConvolutionArgs args{ input, grad_output, grad_weight }; + args.handle = getMiopenHandle(); + setConvolutionParams(&args.params, input, grad_weight, padding, stride, dilation, groups, deterministic); + args.idesc.set(input); + args.wdesc.set(grad_weight); + args.odesc.set(grad_output); + args.cdesc.set(dataType, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups); + + miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenConvolutionBackwardWeights( + args.handle, + &one, args.odesc.desc(), grad_output.data_ptr(), + args.idesc.desc(), input.data_ptr(), + args.cdesc.desc(), bwdFilterAlg, &zero, + args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); +} + +Tensor cudnn_convolution_backward_weight( + CheckedFrom c, + IntList weight_size, const TensorArg& grad_output, const TensorArg& input, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + + checkAllSameType(c, {grad_output, input}); + checkAllSameGPU(c, {grad_output, input}); + + auto grad_weight_t = grad_output->type().tensor(weight_size); + + // For uniformity with everything else, although it seems grad_weight + // would be unambiguous too. 
+ TensorArg grad_weight{ grad_weight_t, "result", 0 }; + convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups); + +#if CUDNN_VERSION < 7000 + for (int i = 0; i < groups; i++) { + raw_cudnn_convolution_backward_weight_out( + narrowGroup(*grad_weight, weight_output_channels_dim, i, groups), + narrowGroup(*grad_output, output_channels_dim, i, groups), + narrowGroup(*input, input_channels_dim, i, groups), + padding, stride, dilation, groups, benchmark, deterministic); + } +#else + raw_cudnn_convolution_backward_weight_out( + *grad_weight, *grad_output, *input, + padding, stride, dilation, groups, benchmark, deterministic); +#endif + + return grad_weight_t; +} + +Tensor cudnn_convolution_backward_weight( + IntList weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + input{ input_t, "input", 2 }; + setMIOpenStreamToCurrent(); + return cudnn_convolution_backward_weight( + "cudnn_convolution_backward_weight", + weight_size, grad_output, input, + padding, stride, dilation, groups, benchmark, deterministic); +} + +Tensor cudnn_convolution_transpose_backward_weight( + IntList weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntList padding, IntList stride, IntList dilation, int64_t groups, + bool benchmark, bool deterministic) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + input{ input_t, "input", 2 }; + setMIOpenStreamToCurrent(); + return cudnn_convolution_backward_weight( + "cudnn_convolution_backward_weight", + weight_size, input, grad_output, + padding, stride, dilation, groups, benchmark, deterministic); +} + +// --------------------------------------------------------------------- +// +// Convolution backward (bias) +// +// --------------------------------------------------------------------- + +Tensor cudnn_convolution_backward_bias( + const Tensor& grad_output_t) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }; + setMIOpenStreamToCurrent(); + + auto grad_bias_t = grad_output->type().tensor( + { grad_output->size(output_channels_dim) }); + + TensorArg grad_bias{ grad_bias_t, "result", 0 }; + + // See Note [CuDNN broadcast padding]. Handle the left padding + // ourselves, but use TensorDescriptor's pad argument to do the rest. 
+ TensorDescriptor bdesc{grad_bias->expand({1, grad_bias->size(0)}), + static_cast(grad_output->dim())}; + TensorDescriptor odesc{*grad_output}; + + auto handle = getMiopenHandle(); + auto dataType = getMiopenDataType(*grad_bias); + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenConvolutionBackwardBias(handle, &one, odesc.desc(), grad_output->data_ptr(), + &zero, bdesc.desc(), grad_bias->data_ptr())); + return *grad_bias; +} + + +}} // namespace + +#endif diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index d6b45b82673dbf..4cd9d347a071b9 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -574,6 +574,8 @@ - func: index_copy_(Tensor self, int64_t dim, IndexTensor index, Tensor source) -> Tensor variants: method +- func: index_put(Tensor self, TensorList indices, Tensor values) -> Tensor + - func: index_put_(Tensor self, TensorList indices, Tensor values) -> Tensor - func: isclose(Tensor self, Tensor other, double rtol=1e-5, double atol=1e-8, bool equal_nan=False) -> Tensor @@ -798,6 +800,9 @@ - func: randperm_out(Tensor result, int64_t n, *, Generator* generator=nullptr) -> Tensor variants: function + dispatch: + CPU: randperm_out_cpu + CUDA: randperm_out_cuda - func: range(Type dtype, Scalar start, Scalar end, Scalar step=1) -> Tensor variants: function diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index f7a40f58d17ef5..8515c95f0ca510 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -3,7 +3,6 @@ IF (MSVC) return() ENDIF() ENDIF(MSVC) - list(APPEND ATen_CPU_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/scalar_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/apply_utils_test.cpp diff --git a/aten/src/TH/THTensor.hpp b/aten/src/TH/THTensor.hpp index e299da20bcdaf9..1ad8491d8c97db 100644 --- a/aten/src/TH/THTensor.hpp +++ b/aten/src/TH/THTensor.hpp @@ -8,6 +8,22 @@ #include +typedef struct _THTensor +{ + int64_t *size; + int64_t *stride; + int nDimension; + + // Note: storage->size may be greater than the recorded size + // of a tensor + THStorage *storage; + ptrdiff_t storageOffset; + std::atomic refcount; + + char flag; + +} _THTensor; + #include "generic/THTensor.hpp" #include "THGenerateAllTypes.h" diff --git a/aten/src/TH/generic/THTensor.hpp b/aten/src/TH/generic/THTensor.hpp index c7bd01d803e0cf..1f5d99fa6d17b6 100644 --- a/aten/src/TH/generic/THTensor.hpp +++ b/aten/src/TH/generic/THTensor.hpp @@ -2,20 +2,9 @@ #define TH_GENERIC_FILE "generic/THTensor.hpp" #else -typedef struct THTensor -{ - int64_t *size; - int64_t *stride; - int nDimension; - - // Note: storage->size may be greater than the recorded size - // of a tensor - THStorage *storage; - ptrdiff_t storageOffset; - std::atomic refcount; - - char flag; - +// This allows one to write generic functions (i.e. functions that work across all THRealTensor types) +// without having to explicitly cast the THRealTensor. 
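The THTensor refactor here hoists the shared fields into a plain _THTensor base, so each generated THRealTensor (see the typedef just below) becomes an empty derived struct and type-agnostic helpers no longer need casts. A toy sketch of that pattern with made-up names:

```cpp
// Toy sketch (hypothetical names, not the TH code) of the base-struct pattern:
// shared fields live in one plain struct, each "real" tensor type is an empty
// derived struct, and generic helpers take the base pointer without casts.
#include <cstdint>
#include <cstdio>

struct _ToyTensor {
  int64_t* size;
  int64_t* stride;
  int nDimension;
};

// What the per-type generated headers reduce to under this scheme:
struct ToyFloatTensor  : _ToyTensor {};
struct ToyDoubleTensor : _ToyTensor {};

// One non-templated helper works for every derived tensor type.
int toy_nDimension(const _ToyTensor* t) { return t->nDimension; }

int main() {
  int64_t sz[2] = {3, 4};
  int64_t st[2] = {4, 1};
  ToyFloatTensor t{};
  t.size = sz;
  t.stride = st;
  t.nDimension = 2;
  std::printf("dims = %d\n", toy_nDimension(&t));
}
```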
+typedef struct THTensor : _THTensor { } THTensor; #endif diff --git a/aten/src/TH/generic/THTensorRandom.cpp b/aten/src/TH/generic/THTensorRandom.cpp index 88f4e3c9daaa9b..677972333e59c7 100644 --- a/aten/src/TH/generic/THTensorRandom.cpp +++ b/aten/src/TH/generic/THTensorRandom.cpp @@ -2,6 +2,8 @@ #define TH_GENERIC_FILE "generic/THTensorRandom.cpp" #else +#include + #ifdef _OPENMP #include #endif @@ -61,7 +63,7 @@ void THTensor_(geometric)(THTensor *self, THGenerator *_generator, double p) #define BERNOULLI_OMP 800 #define TH_OMP_OVERHEAD_THRESHOLD_COPY 20000 -void iBernoulli_generate_copy(THTensor *self, THGenerator *_generator, const double p) +void THTensor_(iBernoulli_generate_copy)(THTensor *self, THGenerator *_generator, const double p) { int64_t seed = THRandom_random(_generator); int64_t n = THTensor_(nElement)(self); @@ -135,7 +137,7 @@ void THTensor_(bernoulli)(THTensor *self, THGenerator *_generator, double p) #ifdef TH_BLAS_MKL if(cpuinfo_initialize() && cpuinfo_vendor_intel == cpuinfo_get_processor(0)->core->vendor) { std::lock_guard lock(_generator->mutex); - iBernoulli_generate_copy(self, _generator, p); + THTensor_(iBernoulli_generate_copy)(self, _generator, p); } else { std::lock_guard lock(_generator->mutex); TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_bernoulli(_generator, p);); @@ -403,6 +405,10 @@ void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTenso THCleanup(THDoubleTensor_free(cum_dist); if (start_dim == 1) THTensor_(squeeze1d)(prob_dist, prob_dist, 0);), 2, "invalid multinomial distribution (encountering probability entry < 0)"); + THArgCheckWithCleanup((std::isfinite(val)), + THCleanup(THDoubleTensor_free(cum_dist); if (start_dim == 1) THTensor_(squeeze1d)(prob_dist, prob_dist, 0);), + 2, + "invalid multinomial distribution (encountering probability entry = infinity or NaN)"); sum += val; THDoubleStorage_set( cum_dist->storage, \ diff --git a/aten/src/THC/THCApply.cuh b/aten/src/THC/THCApply.cuh index 724820d6726005..c202e6d1e0b663 100644 --- a/aten/src/THC/THCApply.cuh +++ b/aten/src/THC/THCApply.cuh @@ -165,13 +165,11 @@ inline dim3 getApplyBlock() { return dim3(THC_APPLY_THREADS_PER_BLOCK); } -inline bool getApplyGrid(THCState* state, uint64_t totalElements, dim3& grid) { - int curDevice = -1; - cudaGetDevice(&curDevice); +inline bool getApplyGrid(THCState* state, uint64_t totalElements, dim3& grid, int curDevice) { if (curDevice == -1) return false; uint64_t numBlocks = THCCeilDiv(totalElements, static_cast(THC_APPLY_THREADS_PER_BLOCK)); - uint64_t maxGridX = THCState_getCurrentDeviceProperties(state)->maxGridSize[0]; + uint64_t maxGridX = THCState_getDeviceProperties(state, curDevice)->maxGridSize[0]; if (numBlocks > maxGridX) numBlocks = maxGridX; @@ -192,11 +190,11 @@ bool THC_pointwiseApply1(THCState* state, const Op& op, TensorArgType aType = ReadWrite) { static_assert(std::is_same::DataType>::value, "ScalarTypeA must match"); - if (TensorUtils::getDims(state, a) > MAX_CUTORCH_DIMS) { + if (THCTensor_nDimension(state, a) > MAX_CUTORCH_DIMS) { return false; } - if (TensorUtils::getDims(state, a) == 0) { + if (THCTensor_nDimension(state, a) == 0) { // Zero-dim tensor; do nothing return true; } @@ -204,9 +202,11 @@ bool THC_pointwiseApply1(THCState* state, const dim3 block = getApplyBlock(); dim3 grid; - ptrdiff_t totalElements = TensorUtils::getNumElements(state, a); - - if (!getApplyGrid(state, totalElements, grid)) { + ptrdiff_t totalElements = THCTensor_nElement(state, a); + + int curDevice = -1; + 
cudaGetDevice(&curDevice); + if (!getApplyGrid(state, totalElements, grid, curDevice)) { return false; } @@ -218,7 +218,7 @@ bool THC_pointwiseApply1(THCState* state, TensorTypeA* oldA = NULL; if (aType == ReadWrite && - TensorUtils::maybeOverlappingIndices(state, a)) { + THCTensor_maybeOverlappingIndices(state, a)) { // Must perform in contiguous space oldA = a; a = TensorUtils::newContiguous(state, a); @@ -236,8 +236,8 @@ bool THC_pointwiseApply1(THCState* state, kernelPointwiseApply1 \ - <<>>( \ - OffsetInfo \ + <<>>( \ + OffsetInfo \ (aInfo), \ (TYPE) totalElements, op); @@ -259,7 +259,7 @@ bool THC_pointwiseApply1(THCState* state, // and the resulting non-linear offset is all computable using 32-bit math?) // We also use unsigned index math in the kernel, as signed div/mod has // additional overhead. - if (TensorUtils::canUse32BitIndexMath(state, a)) { + if (THCTensor_canUse32BitIndexMath(state, a)) { TensorInfo aInfo = getTensorInfo(state, a); rearrangeDims(&aInfo); @@ -310,7 +310,7 @@ bool THC_pointwiseApply1(THCState* state, // instead, it will recursively try and invoke ourselves to make // oldA contiguous. TensorUtils::copyIgnoringOverlaps(state, oldA, a); - TensorUtils::free(state, a); + THCTensor_free(state, a); a = oldA; } @@ -331,17 +331,17 @@ bool THC_pointwiseApply2(THCState* state, static_assert(std::is_same::DataType>::value, "ScalarTypeA must match"); static_assert(std::is_same::DataType>::value, "ScalarTypeB must match"); - ptrdiff_t totalElements = TensorUtils::getNumElements(state, a); - if (totalElements != TensorUtils::getNumElements(state, b)) { + ptrdiff_t totalElements = THCTensor_nElement(state, a); + if (totalElements != THCTensor_nElement(state, b)) { return false; } - if (TensorUtils::getDims(state, a) > MAX_CUTORCH_DIMS || - TensorUtils::getDims(state, b) > MAX_CUTORCH_DIMS) { + if (THCTensor_nDimension(state, a) > MAX_CUTORCH_DIMS || + THCTensor_nDimension(state, b) > MAX_CUTORCH_DIMS) { return false; } - if (TensorUtils::getDims(state, a) == 0) { + if (THCTensor_nDimension(state, a) == 0) { // Zero-dim tensor; do nothing return true; } @@ -349,7 +349,9 @@ bool THC_pointwiseApply2(THCState* state, const dim3 block = getApplyBlock(); dim3 grid; - if (!getApplyGrid(state, totalElements, grid)) { + int curDevice = -1; + cudaGetDevice(&curDevice); + if (!getApplyGrid(state, totalElements, grid, curDevice)) { return false; } @@ -362,13 +364,13 @@ bool THC_pointwiseApply2(THCState* state, TensorTypeB* oldB = NULL; if (aType == ReadWrite && - TensorUtils::maybeOverlappingIndices(state, a)) { + THCTensor_maybeOverlappingIndices(state, a)) { // Must perform in contiguous space oldA = a; a = TensorUtils::newContiguous(state, a); } if (bType == ReadWrite && - TensorUtils::maybeOverlappingIndices(state, b)) { + THCTensor_maybeOverlappingIndices(state, b)) { // Must perform in contiguous space oldB = b; b = TensorUtils::newContiguous(state, b); @@ -387,8 +389,8 @@ bool THC_pointwiseApply2(THCState* state, ScalarTypeA, \ ScalarTypeB, \ TYPE, A, B> \ - <<>>( \ - OffsetInfo \ + <<>>( \ + OffsetInfo \ (aInfo), \ OffsetInfo \ (bInfo), \ @@ -422,8 +424,8 @@ bool THC_pointwiseApply2(THCState* state, } \ } - if (TensorUtils::canUse32BitIndexMath(state, a) && - TensorUtils::canUse32BitIndexMath(state, b)) { + if (THCTensor_canUse32BitIndexMath(state, a) && + THCTensor_canUse32BitIndexMath(state, b)) { TensorInfo aInfo = getTensorInfo(state, a); @@ -490,7 +492,7 @@ bool THC_pointwiseApply2(THCState* state, // instead, it will recursively try and invoke ourselves to make // oldA 
contiguous. TensorUtils::copyIgnoringOverlaps(state, oldA, a); - TensorUtils::free(state, a); + THCTensor_free(state, a); a = oldA; } @@ -499,7 +501,7 @@ bool THC_pointwiseApply2(THCState* state, // instead, it will recursively try and invoke ourselves to make // oldB contiguous. TensorUtils::copyIgnoringOverlaps(state, oldB, b); - TensorUtils::free(state, b); + THCTensor_free(state, b); b = oldB; } @@ -525,20 +527,20 @@ bool THC_pointwiseApply3(THCState* state, static_assert(std::is_same::DataType>::value, "ScalarTypeB must match"); static_assert(std::is_same::DataType>::value, "ScalarTypeB must match"); - ptrdiff_t totalElements = TensorUtils::getNumElements(state, a); + ptrdiff_t totalElements = THCTensor_nElement(state, a); - if (totalElements != TensorUtils::getNumElements(state, b) || - totalElements != TensorUtils::getNumElements(state, c)) { + if (totalElements != THCTensor_nElement(state, b) || + totalElements != THCTensor_nElement(state, c)) { return false; } - if (TensorUtils::getDims(state, a) > MAX_CUTORCH_DIMS || - TensorUtils::getDims(state, b) > MAX_CUTORCH_DIMS || - TensorUtils::getDims(state, c) > MAX_CUTORCH_DIMS) { + if (THCTensor_nDimension(state, a) > MAX_CUTORCH_DIMS || + THCTensor_nDimension(state, b) > MAX_CUTORCH_DIMS || + THCTensor_nDimension(state, c) > MAX_CUTORCH_DIMS) { return false; } - if (TensorUtils::getDims(state, a) == 0) { + if (THCTensor_nDimension(state, a) == 0) { // Zero-dim tensor; do nothing return true; } @@ -546,7 +548,9 @@ bool THC_pointwiseApply3(THCState* state, const dim3 block = getApplyBlock(); dim3 grid; - if (!getApplyGrid(state, totalElements, grid)) { + int curDevice = -1; + cudaGetDevice(&curDevice); + if (!getApplyGrid(state, totalElements, grid, curDevice)) { return false; } @@ -560,19 +564,19 @@ bool THC_pointwiseApply3(THCState* state, TensorTypeC* oldC = NULL; if (aType == ReadWrite && - TensorUtils::maybeOverlappingIndices(state, a)) { + THCTensor_maybeOverlappingIndices(state, a)) { // Must perform in contiguous space oldA = a; a = TensorUtils::newContiguous(state, a); } if (bType == ReadWrite && - TensorUtils::maybeOverlappingIndices(state, b)) { + THCTensor_maybeOverlappingIndices(state, b)) { // Must perform in contiguous space oldB = b; b = TensorUtils::newContiguous(state, b); } if (cType == ReadWrite && - TensorUtils::maybeOverlappingIndices(state, c)) { + THCTensor_maybeOverlappingIndices(state, c)) { // Must perform in contiguous space oldC = c; c = TensorUtils::newContiguous(state, c); @@ -584,7 +588,7 @@ bool THC_pointwiseApply3(THCState* state, ScalarTypeB, \ ScalarTypeC, \ TYPE, A, B, C> \ - <<>>( \ + <<>>( \ OffsetInfo \ (aInfo), \ OffsetInfo \ @@ -635,9 +639,9 @@ bool THC_pointwiseApply3(THCState* state, } \ } - if (TensorUtils::canUse32BitIndexMath(state, a) && - TensorUtils::canUse32BitIndexMath(state, b) && - TensorUtils::canUse32BitIndexMath(state, c)) { + if (THCTensor_canUse32BitIndexMath(state, a) && + THCTensor_canUse32BitIndexMath(state, b) && + THCTensor_canUse32BitIndexMath(state, c)) { TensorInfo aInfo = getTensorInfo(state, a); @@ -720,7 +724,7 @@ bool THC_pointwiseApply3(THCState* state, // instead, it will recursively try and invoke ourselves to make // oldA contiguous. TensorUtils::copyIgnoringOverlaps(state, oldA, a); - TensorUtils::free(state, a); + THCTensor_free(state, a); a = oldA; } @@ -729,7 +733,7 @@ bool THC_pointwiseApply3(THCState* state, // instead, it will recursively try and invoke ourselves to make // oldB contiguous. 
TensorUtils::copyIgnoringOverlaps(state, oldB, b); - TensorUtils::free(state, b); + THCTensor_free(state, b); b = oldB; } @@ -738,7 +742,7 @@ bool THC_pointwiseApply3(THCState* state, // instead, it will recursively try and invoke ourselves to make // oldC contiguous. TensorUtils::copyIgnoringOverlaps(state, oldC, c); - TensorUtils::free(state, c); + THCTensor_free(state, c); c = oldC; } diff --git a/aten/src/THC/THCAtomics.cuh b/aten/src/THC/THCAtomics.cuh index 9e54c56dc45534..e89bf424e30aea 100644 --- a/aten/src/THC/THCAtomics.cuh +++ b/aten/src/THC/THCAtomics.cuh @@ -103,7 +103,7 @@ static inline __device__ void atomicAdd(half *address, half val) { do { assumed = old; -#if CUDA_VERSION < 9000 +#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) half hsum; hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); hsum = THCNumerics::add(hsum, val); @@ -135,7 +135,7 @@ static inline __device__ void atomicAdd(double *address, double val) { // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); } -#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000) +#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000) || defined(__HIP_PLATFORM_HCC__) // This needs to be defined for the host side pass static inline __device__ void atomicAdd(double *address, double val) { } #endif diff --git a/aten/src/THC/THCHalf.h b/aten/src/THC/THCHalf.h index 43ffd76e316896..d9b8cba72d2081 100644 --- a/aten/src/THC/THCHalf.h +++ b/aten/src/THC/THCHalf.h @@ -4,22 +4,16 @@ #include "THCGeneral.h" /* We compile with CudaHalfTensor support if we have this: */ -#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16 +#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16 || defined(__HIP_PLATFORM_HCC__) #define CUDA_HALF_TENSOR 1 #endif -/* For HIP, rely on the half instructions as well.*/ -#if defined(__HIP_PLATFORM_HCC__) -#define CUDA_HALF_TENSOR 1 -#define CUDA_HALF_INSTRUCTIONS 1 -#endif - #ifdef CUDA_HALF_TENSOR #include #include -#if CUDA_VERSION >= 9000 +#if CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__) #ifndef __cplusplus typedef __half_raw half; #endif diff --git a/aten/src/THC/THCNumerics.cuh b/aten/src/THC/THCNumerics.cuh index 1e7f15e5d51473..40d6b9274b2bc9 100644 --- a/aten/src/THC/THCNumerics.cuh +++ b/aten/src/THC/THCNumerics.cuh @@ -48,6 +48,7 @@ struct THCNumerics { static inline __host__ __device__ uint8_t abs(uint8_t a) { return a; } static inline __host__ __device__ uint8_t pow(uint8_t a, uint8_t b) { return powi(a, b); } static inline __host__ __device__ bool isnan(uint8_t a) { return false; } + static inline __host__ __device__ bool isinf(uint8_t a) { return false; } }; template <> @@ -70,6 +71,7 @@ struct THCNumerics { static inline __host__ __device__ int8_t abs(int8_t a) { return ::abs((int)a); } static inline __host__ __device__ int8_t pow(int8_t a, int8_t b) { return powi(a, b); } static inline __host__ __device__ bool isnan(int8_t a) { return false; } + static inline __host__ __device__ bool isinf(int8_t a) { return false; } }; template <> @@ -92,6 +94,7 @@ struct THCNumerics { static inline __host__ __device__ int16_t abs(int16_t a) { return ::abs((int)a); } static inline __host__ __device__ int16_t pow(int16_t a, int16_t b) { return powi(a, b); } static inline __host__ __device__ bool isnan(int16_t a) { return false; } + static inline __host__ __device__ bool isinf(int16_t a) { return false; } }; template <> @@ -114,6 +117,7 @@ struct THCNumerics { static inline __host__ __device__ int32_t abs(int32_t a) { return ::abs(a); } static inline __host__ 
__device__ int32_t pow(int32_t a, int32_t b) { return powi(a, b); } static inline __host__ __device__ bool isnan(int32_t a) { return false; } + static inline __host__ __device__ bool isinf(int32_t a) { return false; } }; template <> @@ -142,6 +146,7 @@ struct THCNumerics { static inline __host__ __device__ int64_t abs(int64_t a) { return labs(a); } static inline __host__ __device__ int64_t pow(int64_t a, int64_t b) { return powi(a, b); } static inline __host__ __device__ bool isnan(int64_t a) { return false; } + static inline __host__ __device__ bool isinf(int64_t a) { return false; } }; #ifdef CUDA_HALF_TENSOR @@ -624,6 +629,19 @@ static inline __host__ __device__ half lgamma(half a) { return ne(a, a); } + static inline __host__ __device__ bool isinf(half a) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hisinf(a) != 0; +#else + float fa = __half2float(a); + return ::isinf(fa); +#endif +#else // __CUDA_ARCH__ + return ::isinf(THC_half2float(a)); +#endif + } + }; #endif @@ -677,6 +695,7 @@ struct THCNumerics { static inline __host__ __device__ float pow (float a, float b) { return powf(a, b); } static inline __host__ __device__ float atan2(float a, float b) { return atan2f(a, b); } static inline __host__ __device__ bool isnan(float a) { return ::isnan(a); } + static inline __host__ __device__ bool isinf(float a) { return ::isinf(a); } }; template <> @@ -729,6 +748,7 @@ struct THCNumerics { static inline __host__ __device__ double pow (double a, double b) { return ::pow(a, b); } static inline __host__ __device__ double atan2(double a, double b) { return ::atan2(a, b); } static inline __host__ __device__ bool isnan(double a) { return ::isnan(a); } + static inline __host__ __device__ bool isinf(double a) { return ::isinf(a); } }; /// `half` has some type conversion issues associated with it, since it diff --git a/aten/src/THC/THCReduce.cuh b/aten/src/THC/THCReduce.cuh index 01afbd5fc44067..e046a7041e4012 100644 --- a/aten/src/THC/THCReduce.cuh +++ b/aten/src/THC/THCReduce.cuh @@ -283,18 +283,18 @@ bool THC_reduceDim(THCState* state, int dim, int keepdim) { static_assert(std::is_same::DataType>::value, "ScalarType must match"); - ptrdiff_t inElements = TensorUtils::getNumElements(state, in); + ptrdiff_t inElements = THCTensor_nElement(state, in); - int64_t reductionSize = TensorUtils::getSize(state, in, dim); - int64_t reductionStride = TensorUtils::getStride(state, in, dim); + int64_t reductionSize = THCTensor_size(state, in, dim); + int64_t reductionStride = THCTensor_stride(state, in, dim); ptrdiff_t outElements = inElements / reductionSize; - if (TensorUtils::getDims(state, out) > MAX_CUTORCH_DIMS || - TensorUtils::getDims(state, in) > MAX_CUTORCH_DIMS) { + if (THCTensor_nDimension(state, out) > MAX_CUTORCH_DIMS || + THCTensor_nDimension(state, in) > MAX_CUTORCH_DIMS) { return false; } - if (TensorUtils::getDims(state, in) == 0) { + if (THCTensor_nDimension(state, in) == 0) { // Zero-dim tensor; do nothing return true; } @@ -343,13 +343,13 @@ bool THC_reduceDim(THCState* state, // Resize out to correspond to the reduced size with keepdim=True. 
// Preserve noncontiguities by unsqueezing out if necessary - TensorUtils::preserveReduceDimSemantics( - state, out, TensorUtils::getDims(state, in), dim, keepdim); + THCTensor_preserveReduceDimSemantics( + state, out, THCTensor_nDimension(state, in), dim, keepdim); // Resize out - THLongStorage* sizes = TensorUtils::newSizeOf(state, in); + THLongStorage* sizes = THCTensor_newSizeOf(state, in); THLongStorage_set(sizes, dim, 1); - TensorUtils::resize(state, out, sizes, NULL); + THCTensor_resize(state, out, sizes, NULL); THLongStorage_free(sizes); // It is possible that the tensor dimensions are able to be collapsed, @@ -418,8 +418,8 @@ bool THC_reduceDim(THCState* state, } \ } - if (TensorUtils::canUse32BitIndexMath(state, out) && - TensorUtils::canUse32BitIndexMath(state, in)) { + if (THCTensor_canUse32BitIndexMath(state, out) && + THCTensor_canUse32BitIndexMath(state, in)) { TensorInfo outInfo = getTensorInfo(state, out); @@ -459,7 +459,7 @@ bool THC_reduceDim(THCState* state, if (!keepdim) { - TensorUtils::squeeze1d(state, out, out, dim); + THCTensor_squeeze1d(state, out, out, dim); } return true; } diff --git a/aten/src/THC/THCReduceAll.cuh b/aten/src/THC/THCReduceAll.cuh index 1237d1998534c6..db9431de8de1da 100644 --- a/aten/src/THC/THCReduceAll.cuh +++ b/aten/src/THC/THCReduceAll.cuh @@ -232,13 +232,13 @@ bool THC_reduceAll(THCState* state, AccT* out, int outOnDevice) { static_assert(std::is_same::DataType>::value, "ScalarTypeA must match"); - ptrdiff_t inElements = TensorUtils::getNumElements(state, in); + ptrdiff_t inElements = THCTensor_nElement(state, in); - if (TensorUtils::getDims(state, in) > MAX_CUTORCH_DIMS) { + if (THCTensor_nDimension(state, in) > MAX_CUTORCH_DIMS) { return false; } - if (TensorUtils::getDims(state, in) == 0) { + if (THCTensor_nDimension(state, in) == 0) { // Zero-dim tensor; do nothing *out = init; return true; @@ -283,7 +283,7 @@ bool THC_reduceAll(THCState* state, } \ } - if (TensorUtils::canUse32BitIndexMath(state, in)) { + if (THCTensor_canUse32BitIndexMath(state, in)) { TensorInfo inInfo = getTensorInfo(state, in); inInfo.collapseDims(); diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp index c1e09275ade571..6547beb9d6ede8 100644 --- a/aten/src/THC/THCStorage.cpp +++ b/aten/src/THC/THCStorage.cpp @@ -8,6 +8,63 @@ #include "generic/THCStorage.cpp" #include "THCGenerateAllTypes.h" +THCStorage* THCStorage_new(THCState *state, at::ScalarType scalar_type) +{ + return THCStorage_newWithSize(state, scalar_type, 0); +} + +THCStorage* THCStorage_newWithSize(THCState *state, at::ScalarType scalar_type, ptrdiff_t size) +{ + return THCStorage_newWithAllocator( + state, scalar_type, size, + state->cudaDeviceAllocator, + state->cudaDeviceAllocator->state); +} + +THCStorage* THCStorage_newWithAllocator(THCState *state, + at::ScalarType scalar_type, + ptrdiff_t size, + THCDeviceAllocator* allocator, + void* allocatorContext) +{ + THArgCheck(size >= 0, 2, "invalid size"); + int device; + THCudaCheck(cudaGetDevice(&device)); + + THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage)); + memset(storage, 0, sizeof(THCStorage)); + new (&storage->refcount) std::atomic(1); + storage->scalar_type = scalar_type; + storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM; + storage->allocator = allocator; + storage->allocatorContext = allocatorContext; + storage->size = size; + storage->device = device; + + if(size > 0) + { + // update heap *before* attempting malloc, to free space for the malloc + cudaError_t err = + 
(*allocator->malloc)(allocatorContext, + (void**)&(storage->data_ptr), + size * at::elementSize(scalar_type), + THCState_getCurrentStreamOnDevice(state, device)); + if(err != cudaSuccess){ + free(storage); + } + THCudaCheck(err); + } else { + storage->data_ptr = NULL; + } + return storage; +} + +void THCStorage_retain(THCState *state, THCStorage *self) +{ + if(self && (self->flag & TH_STORAGE_REFCOUNTED)) + self->refcount++; +} + void THCStorage_free(THCState *state, THCStorage *self) { if(!(self->flag & TH_STORAGE_REFCOUNTED)) @@ -26,3 +83,75 @@ void THCStorage_free(THCState *state, THCStorage *self) THFree(self); } } + +void THCStorage_resize(THCState *state, THCStorage *self, ptrdiff_t size) +{ + THArgCheck(size >= 0, 2, "invalid size"); + THAssert(self->allocator != NULL); + int device; + THCudaCheck(cudaGetDevice(&device)); + + if(!(self->flag & TH_STORAGE_RESIZABLE)) + THError("Trying to resize storage that is not resizable"); + + size_t elementSize = at::elementSize(self->scalar_type); + + if (self->allocator->realloc) { + void * data_ptr = self->data_ptr; + cudaError_t err = (*self->allocator->realloc)( + self->allocatorContext, + (void**)&(data_ptr), + self->size * elementSize, + size * elementSize, THCState_getCurrentStreamOnDevice(state, device)); + if (err != cudaSuccess) { + THCudaCheck(err); + } + self->size = size; + self->device = device; + return; + } + + if(size == 0) + { + if(self->flag & TH_STORAGE_FREEMEM) { + THCudaCheck( + (*self->allocator->free)(self->allocatorContext, self->data_ptr)); + } + self->data_ptr = NULL; + self->size = 0; + self->device = device; + } + else + { + void *data = NULL; + cudaError_t err = + (*self->allocator->malloc)(self->allocatorContext, + (void**)&(data), + size * elementSize, + THCState_getCurrentStreamOnDevice(state, device)); + THCudaCheck(err); + + if (self->data_ptr) { + // Enable p2p access when the memcpy is across devices + THCState_getPeerToPeerAccess(state, device, self->device); + + THCudaCheck(cudaMemcpyAsync(data, + self->data_ptr, + THMin(self->size, size) * elementSize, + cudaMemcpyDeviceToDevice, + THCState_getCurrentStream(state))); + if(self->flag & TH_STORAGE_FREEMEM) { + THCudaCheck( + (*self->allocator->free)(self->allocatorContext, self->data_ptr)); + } + } + + self->data_ptr = data; + self->size = size; + self->device = device; + } +} + +int THCStorage_getDevice(THCState* state, const THCStorage* storage) { + return storage->device; +} \ No newline at end of file diff --git a/aten/src/THC/THCStorage.h b/aten/src/THC/THCStorage.h index 20afc851fabc94..22a607ce43107f 100644 --- a/aten/src/THC/THCStorage.h +++ b/aten/src/THC/THCStorage.h @@ -9,7 +9,4 @@ #include "generic/THCStorage.h" #include "THCGenerateAllTypes.h" -// This exists to have a data-type independent way of freeing (necessary for THPPointer). 
-THC_API void THCStorage_free(THCState *state, THCStorage *self); - #endif diff --git a/aten/src/THC/THCStorage.hpp b/aten/src/THC/THCStorage.hpp index 817d522ff5163d..69b7ac63dcf53c 100644 --- a/aten/src/THC/THCStorage.hpp +++ b/aten/src/THC/THCStorage.hpp @@ -37,3 +37,20 @@ typedef struct THCStorage return static_cast(this->data_ptr); } } THCStorage; + +THC_API THCStorage* THCStorage_new(THCState *state, at::ScalarType scalar_type); +THC_API THCStorage* THCStorage_newWithSize(THCState *state, at::ScalarType scalar_type, ptrdiff_t size); + +THC_API THCStorage* THCStorage_newWithAllocator(THCState *state, + at::ScalarType scalar_type, + ptrdiff_t size, + THCDeviceAllocator* allocator, + void* allocatorContext); + +THC_API void THCStorage_retain(THCState *state, THCStorage *storage); + +// This exists to have a data-type independent way of freeing (necessary for THPPointer). +THC_API void THCStorage_free(THCState *state, THCStorage *self); + +THC_API void THCStorage_resize(THCState *state, THCStorage *storage, ptrdiff_t size); +THC_API int THCStorage_getDevice(THCState* state, const THCStorage* storage); diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index d749ccb713398b..93950c13da7069 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -6,3 +6,409 @@ #include "generic/THCTensor.cpp" #include "THCGenerateAllTypes.h" + +#include "THCTensorInfo.cuh" + +int THCTensor_nDimension(THCState *state, const _THCTensor *self) { + return self->nDimension; +} + +int64_t THCTensor_size(THCState *state, const _THCTensor *self, int dim) { + THArgCheck((dim >= 0) && (dim < self->nDimension), 2, "out of range"); + return self->size[dim]; +} + +int64_t THCTensor_stride(THCState *state, const _THCTensor *self, int dim) { + THArgCheck((dim >= 0) && (dim < self->nDimension), 2, "out of range"); + return self->stride[dim]; +} +THLongStorage *THCTensor_newSizeOf(THCState *state, _THCTensor *self) { + THLongStorage *size = THLongStorage_newWithSize(self->nDimension); + THLongStorage_rawCopy(size, self->size); + return size; +} + +void THCTensor_resize(THCState *state, _THCTensor *self, THLongStorage *size, THLongStorage *stride) { + THArgCheck(size != NULL, 2, "invalid size"); + if(stride) + THArgCheck(stride->size == size->size, 3, "invalid stride"); + + THCTensor_resizeNd(state, self, size->size, THLongStorage_data(size), (stride ? 
THLongStorage_data(stride) : NULL)); +} + +void THCTensor_resizeAs(THCState *state, _THCTensor *self, _THCTensor *src) { + int isSame = 0; + int d; + if(self->nDimension == src->nDimension) + { + isSame = 1; + for(d = 0; d < self->nDimension; d++) + { + if(self->size[d] != src->size[d]) + { + isSame = 0; + break; + } + } + } + + if(!isSame) + THCTensor_resizeNd(state, self, src->nDimension, src->size, NULL); +} + +void THCTensor_resizeNd(THCState *state, _THCTensor *self, int nDimension, int64_t *size, int64_t *stride) +{ + int d; + int nDimension_; + ptrdiff_t totalSize; + int hascorrectsize = 1; + + nDimension_ = 0; + for(d = 0; d < nDimension; d++) + { + if(size[d] > 0) + { + nDimension_++; + if((self->nDimension > d) && (size[d] != self->size[d])) + hascorrectsize = 0; + + if((self->nDimension > d) && stride && (stride[d] >= 0) && (stride[d] != self->stride[d])) + hascorrectsize = 0; + } + else + break; + } + nDimension = nDimension_; + + if(nDimension != self->nDimension) + hascorrectsize = 0; + + if(hascorrectsize) + return; + + if(nDimension > 0) + { + if(nDimension != self->nDimension) + { + self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*nDimension); + self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*nDimension); + self->nDimension = nDimension; + } + + totalSize = 1; + for(d = self->nDimension-1; d >= 0; d--) + { + self->size[d] = size[d]; + if(stride && (stride[d] >= 0) ) + self->stride[d] = stride[d]; + else + { + if(d == self->nDimension-1) + self->stride[d] = 1; + else + self->stride[d] = self->size[d+1]*self->stride[d+1]; + } + totalSize += (self->size[d]-1)*self->stride[d]; + } + + if(totalSize+self->storageOffset > 0) + { + if(!self->storage) + THError("Tensor: invalid null storage"); + if(totalSize+self->storageOffset > self->storage->size) + THCStorage_resize(state, self->storage, totalSize+self->storageOffset); + } + } + else + self->nDimension = 0; +} + +void THCTensor_set(THCState *state, _THCTensor *self, _THCTensor *src) +{ + if(self != src) + THCTensor_setStorageNd(state, + self, + src->storage, + src->storageOffset, + src->nDimension, + src->size, + src->stride); +} + +void THCTensor_setStorageNd(THCState *state, _THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) +{ + /* storage */ + if(self->storage != storage) + { + if (!self->storage) { + THError("Tensor: invalid null storage"); + } + auto scalar_type = self->storage->scalar_type; + THCStorage_free(state, self->storage); + + if(storage) + { + self->storage = storage; + THCStorage_retain(state, self->storage); + } + else + self->storage = THCStorage_new(state, scalar_type); + } + + /* storageOffset */ + if(storageOffset < 0) + THError("Tensor: invalid storage offset"); + self->storageOffset = storageOffset; + + /* size and stride */ + THCTensor_resizeNd(state, self, nDimension, size, stride); +} + + +void THCTensor_squeeze1d(THCState *state, _THCTensor *self, _THCTensor *src, int dimension) +{ + int d; + + if(!src) + src = self; + + THArgCheck(dimension < src->nDimension, 3, "dimension out of range"); + + THCTensor_set(state, self, src); + + if(src->size[dimension] == 1 && src->nDimension > 1) + { + for(d = dimension; d < self->nDimension-1; d++) + { + self->size[d] = self->size[d+1]; + self->stride[d] = self->stride[d+1]; + } + self->nDimension--; + } +} + +void THCTensor_unsqueeze1d(THCState *state, _THCTensor *self, _THCTensor *src, int dimension) +{ + int d; + + if(!src) + src = self; + + THArgCheck((dimension >= 0) && 
(dimension <= src->nDimension), 3, "dimension out of range"); + THArgCheck(src->nDimension > 0, 3, "cannot unsqueeze empty tensor"); + + THCTensor_set(state, self, src); + + self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*(self->nDimension+1)); + self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*(self->nDimension+1)); + self->nDimension++; + for (d = self->nDimension-1; d > dimension; d--) { + self->size[d] = self->size[d-1]; + self->stride[d] = self->stride[d-1]; + } + if (dimension+1 < self->nDimension) { + self->stride[dimension] = self->size[dimension+1] * self->stride[dimension+1]; + } else { + self->stride[dimension] = 1; + } + self->size[dimension] = 1; +} + +bool THCTensor_isContiguous(THCState *state, const _THCTensor *self) { + int64_t z = 1; + int d; + for(d = self->nDimension-1; d >= 0; d--) + { + if(self->size[d] != 1) + { + if(self->stride[d] == z) + z *= self->size[d]; + else + return false; + } + } + return true; +} + +bool THCTensor_allContiguous(THCState *state, const _THCTensor **inputs, int numInputs) { + THAssert(numInputs > 0); + for (int i = 0; i < numInputs; ++i) { + if (!THCTensor_isContiguous(state, inputs[i])) { + return false; + } + } + return true; +} + +ptrdiff_t THCTensor_nElement(THCState *state, const _THCTensor *self) { + if(self->nDimension == 0) + return 0; + else + { + ptrdiff_t nElement = 1; + int d; + for(d = 0; d < self->nDimension; d++) + nElement *= self->size[d]; + return nElement; + } +} + +void THCTensor_retain(THCState *state, _THCTensor *self) { + if(self->flag & TH_TENSOR_REFCOUNTED) + self->refcount++; +} + + +void THCTensor_free(THCState *state, _THCTensor *self) { + if(!self) + return; + + if(self->flag & TH_TENSOR_REFCOUNTED) + { + if(--self->refcount == 0) + { + THFree(self->size); + THFree(self->stride); + if(self->storage) + THCStorage_free(state, self->storage); + self->refcount.~atomic(); + THFree(self); + } + } +} + +int THCTensor_getDevice(THCState* state, const _THCTensor* tensor) { + if (!tensor->storage) return -1; + return THCStorage_getDevice(state, tensor->storage); +} + +bool THCTensor_allSameDevice(THCState* state, const _THCTensor ** inputs, int numInputs) { + THAssert(numInputs > 0); + int device = THCTensor_getDevice(state, inputs[0]); + for (int i = 1; i < numInputs; ++i) { + if (THCTensor_getDevice(state, inputs[i]) != device) { + return false; + } + } + return true; +} + +bool THCTensor_canUse32BitIndexMath(THCState* state, const _THCTensor* t, ptrdiff_t max_elem) { + ptrdiff_t elements = THCTensor_nElement(state, t); + if (elements >= max_elem) { + return false; + } + + ptrdiff_t offset = 0; + ptrdiff_t linearId = elements - 1; + + for (int i = THCTensor_nDimension(state, t) - 1; i >= 0; --i) { + ptrdiff_t curDimIndex = + linearId % THCTensor_size(state, t, i); + ptrdiff_t curDimOffset = curDimIndex * + THCTensor_stride(state, t, i); + offset += curDimOffset; + linearId /= THCTensor_size(state, t, i); + } + + if (offset >= max_elem) { + return false; + } + + return true; +} + +bool THCTensor_all32BitIndexable(THCState* state, const _THCTensor** inputs, int numInputs) { + for (int i = 0; i < numInputs; ++i) { + if (!THCTensor_canUse32BitIndexMath(state, inputs[i])) { + return false; + } + } + return true; +} + +/* Due to the resize semantics of ops with `out=` keywords, if */ \ +/* the output `tensor` has the same shape as the output of the */ \ +/* reduction operation, then any noncontiguities in the output */ \ +/* `tensor` should be preserved. 
This needs to be special cased b/c */ \ +/* otherwise, when keepdim=False, the implementations of reduction */ \ +/* ops resize `tensor` to the reduced size with keepdim=True, and */ \ +/* then later squeeze `tensor` to the correct output size, breaking */ \ +/* the contiguity guarantees of the resize semantics. */ \ +void THCTensor_preserveReduceDimSemantics(THCState *state, _THCTensor *tensor, + int in_dims, int64_t dimension, int keepdim) { + int out_dims = THCTensor_nDimension(state, tensor); + if (out_dims > 0 && !keepdim && out_dims == in_dims - 1) { + THCTensor_unsqueeze1d(state, tensor, tensor, dimension); + } +} + +namespace { + +struct SizeAndStride { + int64_t size; + int64_t stride; +}; + +/* + A comparator that will sort SizeAndStride structs by stride, + in ascending order. + */ +int compareSizeAndStride(const void* a, const void* b) { + const SizeAndStride* aS = (const SizeAndStride*) a; + const SizeAndStride* bS = (const SizeAndStride*) b; + + if (aS->stride < bS->stride) return -1; + if (aS->stride == bS->stride) return 0; + return 1; +} + +} + +/* Returns false if there is no possibility that the tensor */ +/* has "overlapping" indices and true otherwise. */ +/* "Overlapping" indices are two+ valid indices that specify */ +/* the same offset within the tensor. */ +/* The function does this by checking for a sufficient but not */ +/* necessary condition of no overlap. In particular, that */ +/* that there exists an ordering of the tensor's dimensions */ +/* that is nicely "nested," with each dimension contained */ +/* within the next one. */ +bool THCTensor_maybeOverlappingIndices(THCState* state, const _THCTensor* t) { + /* Extract size/stride arrays; only consider size >1 dims. */ + SizeAndStride info[MAX_CUTORCH_DIMS]; + + int dims = THCTensor_nDimension(state, t); + int nonSize1Dims = 0; + for (int i = 0; i < dims; ++i) { + int64_t size = THCTensor_size(state, t, i); + + if (size > 1) { + info[nonSize1Dims].size = size; + info[nonSize1Dims].stride = + THCTensor_stride(state, t, i); + + if (info[nonSize1Dims].stride < 1) { + return true; + } + + ++nonSize1Dims; + } + } + + /* Short-circuits if tensor is a single element. 
*/ + if (nonSize1Dims == 0) { + return false; + } + + /* Ascending order (innermost dimension in sorted view is at [0]) */ + qsort(info, nonSize1Dims, sizeof(SizeAndStride), compareSizeAndStride); + + for (int i = 0; i < (nonSize1Dims - 1); ++i) { + if (((info[i].size - 1) * info[i].stride) >= info[i + 1].stride) { + return true; + } + } + + return false; +} diff --git a/aten/src/THC/THCTensor.hpp b/aten/src/THC/THCTensor.hpp index b82279df731e32..c230d117913f7c 100644 --- a/aten/src/THC/THCTensor.hpp +++ b/aten/src/THC/THCTensor.hpp @@ -9,5 +9,55 @@ #include +typedef struct _THCTensor +{ + int64_t *size; + int64_t *stride; + int nDimension; + + THCStorage *storage; + ptrdiff_t storageOffset; + std::atomic refcount; + + char flag; + +} _THCTensor; + #include "generic/THCTensor.hpp" #include "THCGenerateAllTypes.h" + +THC_API int THCTensor_nDimension(THCState *state, const _THCTensor *self); +THC_API int64_t THCTensor_size(THCState *state, const _THCTensor *self, int dim); +THC_API int64_t THCTensor_stride(THCState *state, const _THCTensor *self, int dim); +THC_API THLongStorage *THCTensor_newSizeOf(THCState *state, _THCTensor *self); + +THC_API void THCTensor_resize(THCState *state, _THCTensor *tensor, THLongStorage *size, THLongStorage *stride); +THC_API void THCTensor_resizeAs(THCState *state, _THCTensor *tensor, _THCTensor *src); +THC_API void THCTensor_resizeNd(THCState *state, _THCTensor *tensor, int nDimension, int64_t *size, int64_t *stride); + +THC_API void THCTensor_set(THCState *state, _THCTensor *self, _THCTensor *src); +THC_API void THCTensor_setStorageNd(THCState *state, _THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride); + +THC_API void THCTensor_squeeze1d(THCState *state, _THCTensor *self, _THCTensor *src, int dimension_); +THC_API void THCTensor_unsqueeze1d(THCState *state, _THCTensor *self, _THCTensor *src, int dimension_); + +THC_API bool THCTensor_isContiguous(THCState *state, const _THCTensor *self); +THC_API bool THCTensor_allContiguous(THCState *state, const _THCTensor **inputs, int numInputs); +THC_API ptrdiff_t THCTensor_nElement(THCState *state, const _THCTensor *self); + +THC_API void THCTensor_retain(THCState *state, _THCTensor *self); +THC_API void THCTensor_free(THCState *state, _THCTensor *self); + +THC_API int THCTensor_getDevice(THCState* state, const _THCTensor* tensor); +THC_API bool THCTensor_allSameDevice(THCState* state, const _THCTensor ** inputs, int numInputs); + +/* Can we use 32 bit math for indexing? */ +THC_API bool THCTensor_canUse32BitIndexMath(THCState* state, const _THCTensor* t, ptrdiff_t max_elem=INT32_MAX); +/* Are all tensors 32-bit indexable? */ +THC_API bool THCTensor_all32BitIndexable(THCState* state, const _THCTensor** inputs, int numInputs); +THC_API void THCTensor_preserveReduceDimSemantics(THCState *state, _THCTensor *tensor, int in_dims, + int64_t dimension, int keepdim); +/* Returns false if there is no possibility that the tensor */ +/* has more than one index that references the same datapoint, */ +/* true otherwise. 
*/ +THC_API bool THCTensor_maybeOverlappingIndices(THCState* state, const _THCTensor* t); diff --git a/aten/src/THC/THCTensorCopy.cu b/aten/src/THC/THCTensorCopy.cu index 503f90172d2263..c7f1eee78a7060 100644 --- a/aten/src/THC/THCTensorCopy.cu +++ b/aten/src/THC/THCTensorCopy.cu @@ -27,13 +27,13 @@ THC_copyTensor(THCState* state, TensorTypeDst* dst, TensorTypeSrc* src) { static_assert(std::is_same::DataType>::value, "ScalarTypeDst must match"); static_assert(std::is_same::DataType>::value, "ScalarTypeSrc must match"); - ptrdiff_t totalElements = TensorUtils::getNumElements(state, dst); + ptrdiff_t totalElements = THCTensor_nElement(state, dst); THArgCheck(totalElements == - TensorUtils::getNumElements(state, src), + THCTensor_nElement(state, src), 2, "sizes do not match"); - if (TensorUtils::getDims(state, dst) == 0) { + if (THCTensor_nDimension(state, dst) == 0) { // Zero-dim tensor; copy nothing return; } @@ -47,13 +47,13 @@ THC_copyTensor(THCState* state, TensorTypeDst* dst, TensorTypeSrc* src) { // contiguous). // -AND: both tensors have the same type. bool sameType = isSameType(); - bool srcContig = TensorUtils::isContiguous(state, src); - bool dstContig = TensorUtils::isContiguous(state, dst); + bool srcContig = THCTensor_isContiguous(state, src); + bool dstContig = THCTensor_isContiguous(state, dst); bool memcpyEligible = ((srcContig && dstContig) || (totalElements == 1)) && sameType; - int srcDev = TensorUtils::getDevice(state, src); - int dstDev = TensorUtils::getDevice(state, dst); + int srcDev = THCTensor_getDevice(state, src); + int dstDev = THCTensor_getDevice(state, dst); int oldDev = curGPU(); // Try to enable p2p access. This also handles the case srcDev == dstDev. @@ -140,7 +140,7 @@ THC_copyTensor(THCState* state, TensorTypeDst* dst, TensorTypeSrc* src) { // Types are different // Copy into the new format, contiguous, on the source device srcContig = TensorUtils::newTensor(state); - TensorUtils::resizeAs(state, srcContig, dst); + THCTensor_resizeAs(state, srcContig, dst); bool succ = THC_pointwiseApply2::free(state, srcContig); + THCTensor_free(state, srcContig); if (dst != dstContig) { TensorUtils::freeCopyTo(state, dstContig, dst); } else { - TensorUtils::free(state, dstContig); + THCTensor_free(state, dstContig); } // We're still on srcDev at this point diff --git a/aten/src/THC/THCTensorIndex.cu b/aten/src/THC/THCTensorIndex.cu index 32954c4a3ebf9d..68bea1b16d6f26 100644 --- a/aten/src/THC/THCTensorIndex.cu +++ b/aten/src/THC/THCTensorIndex.cu @@ -459,7 +459,7 @@ void dispatchTakePutImpl(THCState *state, TensorType *a, TensorType *b, THCudaLo auto aInfo = getTensorInfo(state, a); aInfo.collapseDims(); - auto numel = TensorUtils::getNumElements(state, a); + auto numel = THCTensor_nElement(state, a); if (aInfo.isContiguous()) { auto op = Op(aInfo, numel, start, end); THC_pointwiseApply2(state, b, index, op); @@ -471,7 +471,7 @@ void dispatchTakePutImpl(THCState *state, TensorType *a, TensorType *b, THCudaLo template class Op, typename TensorType> void dispatchTakePut(THCState *state, TensorType *a, TensorType *b, THCudaLongTensor *index) { - if (TensorUtils::canUse32BitIndexMath(state, a, INT_MAX)) { + if (THCTensor_canUse32BitIndexMath(state, a, INT_MAX)) { dispatchTakePutImpl(state, a, b, index); } else { dispatchTakePutImpl(state, a, b, index); diff --git a/aten/src/THC/THCTensorMathCompare.cuh b/aten/src/THC/THCTensorMathCompare.cuh index 953f6a59090561..9fac6089acb2aa 100644 --- a/aten/src/THC/THCTensorMathCompare.cuh +++ b/aten/src/THC/THCTensorMathCompare.cuh @@ 
-73,8 +73,8 @@ void THC_logicalValue(THCState *state, TensorTypeOut *self_, TensorType *src, Op op) { - THLongStorage* st = TensorUtils::newSizeOf(state, src); - TensorUtils::resize(state, self_, st, NULL); + THLongStorage* st = THCTensor_newSizeOf(state, src); + THCTensor_resize(state, self_, st, NULL); THLongStorage_free(st); if (!THC_pointwiseApply2(state, self_, src, op)) { diff --git a/aten/src/THC/THCTensorMathCompareT.cuh b/aten/src/THC/THCTensorMathCompareT.cuh index e2351e1404b3b9..9b1fb4e50bdddc 100644 --- a/aten/src/THC/THCTensorMathCompareT.cuh +++ b/aten/src/THC/THCTensorMathCompareT.cuh @@ -56,12 +56,12 @@ void THC_logicalTensor(THCState *state, TensorType *src1, TensorType *src2, Op op) { - THLongStorage* st = TensorUtils::newSizeOf(state, src1); - TensorUtils::resize(state, self_, st, NULL); + THLongStorage* st = THCTensor_newSizeOf(state, src1); + THCTensor_resize(state, self_, st, NULL); THLongStorage_free(st); - THArgCheck(TensorUtils::getNumElements(state, src1) == - TensorUtils::getNumElements(state, src2), 3, + THArgCheck(THCTensor_nElement(state, src1) == + THCTensor_nElement(state, src2), 3, "sizes do not match"); if (!THC_pointwiseApply3(state, self_, src1, src2, op)) { diff --git a/aten/src/THC/THCTensorMathPointwise.cuh b/aten/src/THC/THCTensorMathPointwise.cuh index 67dfaf06a43748..26389c3a990baf 100644 --- a/aten/src/THC/THCTensorMathPointwise.cuh +++ b/aten/src/THC/THCTensorMathPointwise.cuh @@ -37,7 +37,7 @@ struct TensorSigmoidOp { __device__ __forceinline__ void operator()(half* out, half* in) const { #ifdef CUDA_HALF_INSTRUCTIONS half one = ScalarConvert::to(1); - *out = hdiv(one, __hadd(one, hexp(__hneg(*in)))); + *out = __hdiv(one, __hadd(one, hexp(__hneg(*in)))); #else float fin = __half2float(*in); *out = __float2half(1.0f / (1.0f + expf(- fin))); @@ -47,7 +47,7 @@ struct TensorSigmoidOp { __device__ __forceinline__ void operator()(half* v) const { #ifdef CUDA_HALF_INSTRUCTIONS half one = ScalarConvert::to(1); - *v = hdiv(one, __hadd(one, hexp(__hneg(*v)))); + *v = __hdiv(one, __hadd(one, hexp(__hneg(*v)))); #else float fv = __half2float(*v); *v = __float2half(1.0f / (1.0f + expf(- fv))); diff --git a/aten/src/THC/THCTensorMathReduce.cuh b/aten/src/THC/THCTensorMathReduce.cuh index b95eb5d9bb3962..e4def67689fa03 100644 --- a/aten/src/THC/THCTensorMathReduce.cuh +++ b/aten/src/THC/THCTensorMathReduce.cuh @@ -290,17 +290,17 @@ __global__ void THCTensor_kernel_varOuterDim(T *tgt, T *src_, unsigned num_orows template __host__ void THCTensor_varOuterDim(THCState *state, TensorTypeK *tgt, TensorTypeK *src, int64_t dimension, int flag) { - unsigned ndim = TensorUtils::getDims(state, src); + unsigned ndim = THCTensor_nDimension(state, src); // Treat all outer dimensions (i.e. dim < dimension) as one. unsigned num_orows = 1; for (int64_t dim = 0; dim < dimension; dim++) { - num_orows *= TensorUtils::getSize(state, src, dim); + num_orows *= THCTensor_size(state, src, dim); } - unsigned row_size = TensorUtils::getSize(state, src, dimension); + unsigned row_size = THCTensor_size(state, src, dimension); // Treat all inner dimensions (i.e. dim > dimension) as one. 
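// (Illustrative: for a src of shape [a, b, c, d] with dimension == 1, the loops above and below yield
//  num_orows = a, row_size = b, and num_irows = c * d.)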
unsigned num_irows = 1; for (unsigned dim = dimension + 1; dim < ndim; dim++) { - num_irows *= TensorUtils::getSize(state, src, dim); + num_irows *= THCTensor_size(state, src, dim); } dim3 threads(min(512, num_irows)); @@ -436,13 +436,13 @@ __global__ void THCTensor_kernel_varInnermostDim(T *tgt, T *src_, unsigned num_r template __host__ void THCTensor_varInnermostDim(THCState *state, TensorTypeK *tgt, TensorTypeK *src, int flag) { - unsigned ndim = TensorUtils::getDims(state, src); + unsigned ndim = THCTensor_nDimension(state, src); // Treat all outer dimensions as a single dimension. unsigned num_rows = 1; for (unsigned dim = 0; dim < ndim - 1; dim++) { - num_rows *= TensorUtils::getSize(state, src, dim); + num_rows *= THCTensor_size(state, src, dim); } - unsigned row_size = TensorUtils::getSize(state, src, ndim - 1); + unsigned row_size = THCTensor_size(state, src, ndim - 1); // From limited testing, 16x32 seemed a good compromise for handling both long and short dimensions. dim3 threads(16, 32); @@ -509,15 +509,15 @@ THC_transformReduceOuterDimIndex(THCState *state, int64_t rdim, const thrust::pair& init, BinaryFunction binary_op) { - unsigned ndim = TensorUtils::getDims(state, src); + unsigned ndim = THCTensor_nDimension(state, src); unsigned num_orows = 1; for (int64_t dim = 0; dim < rdim; dim++) { - num_orows *= TensorUtils::getSize(state, src, dim); + num_orows *= THCTensor_size(state, src, dim); } - unsigned row_size = TensorUtils::getSize(state, src, rdim); + unsigned row_size = THCTensor_size(state, src, rdim); unsigned num_irows = 1; for (unsigned dim = rdim + 1; dim < ndim; dim++) { - num_irows *= TensorUtils::getSize(state, src, dim); + num_irows *= THCTensor_size(state, src, dim); } dim3 threads(min(512, num_irows)); @@ -612,12 +612,12 @@ THC_transformReduceInnermostDimIndex(THCState *state, TensorTypeK *src, const thrust::pair& init, BinaryFunction binary_op) { - unsigned ndim = TensorUtils::getDims(state, src); + unsigned ndim = THCTensor_nDimension(state, src); unsigned num_rows = 1; for (unsigned dim = 0; dim < ndim - 1; dim++) { - num_rows *= TensorUtils::getSize(state, src, dim); + num_rows *= THCTensor_size(state, src, dim); } - unsigned row_size = TensorUtils::getSize(state, src, ndim - 1); + unsigned row_size = THCTensor_size(state, src, ndim - 1); dim3 threads(16, 32); dim3 grid(min(1024, THCCeilDiv(num_rows, threads.y))); @@ -650,40 +650,40 @@ THC_reduceDimIndex(THCState *state, static_assert(std::is_same::DataType>::value, "ScalarTypeK must match"); static_assert(std::is_same::DataType>::value, "ScalarTypeIndex must match"); THArgCheck(dimension >= 0 && - dimension < TensorUtils::getDims(state, src), + dimension < THCTensor_nDimension(state, src), 3, "dimension out of range"); // Unsqueeze tgt1_/tgt_2 if necessary so that their contiguity traits // are preserved if they are the same size as the correct reduction output. 
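// (Illustrative: with keepdim=false, a caller may pass pre-allocated [4, 6] tgt tensors for a reduction
//  of a [4, 5, 6] src over dimension 1; unsqueezing them to [4, 1, 6] first makes the resize below a
//  no-op, so any non-contiguous layout of the outputs survives, matching the out= resize semantics.)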
- int src_dims = TensorUtils::getDims(state, src); - TensorUtils::preserveReduceDimSemantics( + int src_dims = THCTensor_nDimension(state, src); + THCTensor_preserveReduceDimSemantics( state, tgt1_, src_dims, dimension, keepdim); - TensorUtils::preserveReduceDimSemantics( + THCTensor_preserveReduceDimSemantics( state, tgt2_, src_dims, dimension, keepdim); - THLongStorage *dim = TensorUtils::newSizeOf(state, src); + THLongStorage *dim = THCTensor_newSizeOf(state, src); THLongStorage_set(dim, dimension, 1); - TensorUtils::resize(state, tgt1_, dim, NULL); - TensorUtils::resize(state, tgt2_, dim, NULL); + THCTensor_resize(state, tgt1_, dim, NULL); + THCTensor_resize(state, tgt2_, dim, NULL); THLongStorage_free(dim); TensorTypeK *tgt1 = TensorUtils::newContiguous(state, tgt1_); TensorTypeIndex *tgt2 = TensorUtils::newContiguous(state, tgt2_); src = TensorUtils::newContiguous(state, src); - if (dimension == TensorUtils::getDims(state, src) - 1) { + if (dimension == THCTensor_nDimension(state, src) - 1) { THC_transformReduceInnermostDimIndex(state, tgt1, tgt2, src, init, binary_op); } else { THC_transformReduceOuterDimIndex(state, tgt1, tgt2, src, dimension, init, binary_op); } - TensorUtils::free(state, src); + THCTensor_free(state, src); TensorUtils::freeCopyTo(state, tgt1, tgt1_); TensorUtils::freeCopyTo(state, tgt2, tgt2_); if (!keepdim) { - TensorUtils::squeeze1d(state, tgt1_, tgt1_, dimension); - TensorUtils::squeeze1d(state, tgt2_, tgt2_, dimension); + THCTensor_squeeze1d(state, tgt1_, tgt1_, dimension); + THCTensor_squeeze1d(state, tgt2_, tgt2_, dimension); } } diff --git a/aten/src/THC/THCTensorRandom.cu b/aten/src/THC/THCTensorRandom.cu index 432138493dd72f..8ee4b122e3382b 100644 --- a/aten/src/THC/THCTensorRandom.cu +++ b/aten/src/THC/THCTensorRandom.cu @@ -13,7 +13,7 @@ #include #include -#define MAX_NUM_BLOCKS 200 +#define MAX_NUM_BLOCKS 200 #define BLOCK_SIZE 256 @@ -170,6 +170,7 @@ __global__ void generate_bernoulli_tensor(curandStateMtgp32 *state, int size, // NOTE: curand_uniform is (0, 1] and we want [a, b) GENERATE_KERNEL2(generate_uniform, float, float a, float b, float, curand_uniform, reverse_bounds(x) * (b-a) + a) +GENERATE_KERNEL2(generate_uniform, float, double a, double b, float, curand_uniform, reverse_bounds(x) * (b-a) + a) GENERATE_KERNEL2(generate_uniform, double, double a, double b, double, curand_uniform_double, reverse_bounds(x) * (b-a) + a) GENERATE_KERNEL2(generate_normal, float, double mean, double stdv, float, curand_normal, (x * stdv) + mean) diff --git a/aten/src/THC/THCTensorRandom.cuh b/aten/src/THC/THCTensorRandom.cuh index 4637c4555963b6..7749f231c5c771 100644 --- a/aten/src/THC/THCTensorRandom.cuh +++ b/aten/src/THC/THCTensorRandom.cuh @@ -183,6 +183,8 @@ sampleMultinomialOnce(int64_t* dest, for (int cat = threadIdx.x; cat < categories; cat += blockDim.x) { val = dist[curDist * stride_dist + cat * stride_categories]; assert(THCNumerics::ge(val, zero)); + assert(!THCNumerics::isinf(val)); + assert(!THCNumerics::isnan(val)); sum = THCNumerics::add(sum, ScalarConvert::to(val)); } diff --git a/aten/src/THC/THCTensorSort.cu b/aten/src/THC/THCTensorSort.cu index 6d3dbb5cced3ad..3635886300fc39 100644 --- a/aten/src/THC/THCTensorSort.cu +++ b/aten/src/THC/THCTensorSort.cu @@ -29,7 +29,7 @@ void THCudaLongTensor_fillSliceWithIndex(THCState* state, <<>>( \ info, numSlices, sliceSize, info.strides[collapseDim]) - if (TensorUtils::canUse32BitIndexMath(state, t)) { + if (THCTensor_canUse32BitIndexMath(state, t)) { TensorInfo info = getTensorInfo(state, t); 
info.reduceDim(dim); diff --git a/aten/src/THC/THCTensorTypeUtils.cu b/aten/src/THC/THCTensorTypeUtils.cu index b38766381bd013..f29348a92d9b6b 100644 --- a/aten/src/THC/THCTensorTypeUtils.cu +++ b/aten/src/THC/THCTensorTypeUtils.cu @@ -4,28 +4,6 @@ #include "THCHalf.h" #include -namespace { - -struct SizeAndStride { - int64_t size; - int64_t stride; -}; - -/* - A comparator that will sort SizeAndStride structs by stride, - in ascending order. - */ -int compareSizeAndStride(const void* a, const void* b) { - const SizeAndStride* aS = (const SizeAndStride*) a; - const SizeAndStride* bS = (const SizeAndStride*) b; - - if (aS->stride < bS->stride) return -1; - if (aS->stride == bS->stride) return 0; - return 1; -} - -} - #define IMPL_TENSOR_UTILS(TENSOR_TYPE, DATA_TYPE) \ \ TENSOR_TYPE* \ @@ -39,24 +17,6 @@ TensorUtils::newContiguous(THCState* state, \ return TENSOR_TYPE##_newContiguous(state, t); \ } \ \ -THLongStorage* \ -TensorUtils::newSizeOf(THCState* state, \ - TENSOR_TYPE* t) { \ - return TENSOR_TYPE##_newSizeOf(state, t); \ -} \ - \ -void \ -TensorUtils::retain(THCState* state, \ - TENSOR_TYPE* t) { \ - TENSOR_TYPE##_retain(state, t); \ -} \ - \ -void \ -TensorUtils::free(THCState* state, \ - TENSOR_TYPE* t) { \ - TENSOR_TYPE##_free(state, t); \ -} \ - \ void \ TensorUtils::freeCopyTo(THCState* state, \ TENSOR_TYPE* src, \ @@ -64,37 +24,6 @@ TensorUtils::freeCopyTo(THCState* state, \ TENSOR_TYPE##_freeCopyTo(state, src, dst); \ } \ \ -void \ -TensorUtils::resize(THCState* state, \ - TENSOR_TYPE* out, \ - THLongStorage* sizes, \ - THLongStorage* strides) { \ - TENSOR_TYPE##_resize(state, out, sizes, strides); \ -} \ - \ -void \ -TensorUtils::resizeAs(THCState* state, \ - TENSOR_TYPE* dst, \ - TENSOR_TYPE* src) { \ - TENSOR_TYPE##_resizeAs(state, dst, src); \ -} \ - \ -void \ -TensorUtils::squeeze1d(THCState *state, \ - TENSOR_TYPE *dst, \ - TENSOR_TYPE *src, \ - int dimension) { \ - TENSOR_TYPE##_squeeze1d(state, dst, src, dimension); \ -} \ - \ -void \ -TensorUtils::unsqueeze1d(THCState *state, \ - TENSOR_TYPE *dst, \ - TENSOR_TYPE *src, \ - int dimension) { \ - TENSOR_TYPE##_unsqueeze1d(state, dst, src, dimension); \ -} \ - \ DATA_TYPE* \ TensorUtils::getData(THCState* state, \ TENSOR_TYPE* t) { \ @@ -102,185 +31,12 @@ TensorUtils::getData(THCState* state, \ return (DATA_TYPE*) TENSOR_TYPE##_data(state, t); \ } \ \ -ptrdiff_t \ -TensorUtils::getNumElements(THCState* state, \ - TENSOR_TYPE* t) { \ - return TENSOR_TYPE##_nElement(state, t); \ -} \ - \ -int64_t \ -TensorUtils::getSize(THCState* state, \ - TENSOR_TYPE* t, \ - int dim) { \ - return TENSOR_TYPE##_size(state, t, dim); \ -} \ - \ -int64_t \ -TensorUtils::getStride(THCState* state, \ - TENSOR_TYPE* t, \ - int dim) { \ - return TENSOR_TYPE##_stride(state, t, dim); \ -} \ - \ -int \ -TensorUtils::getDims(THCState* state, \ - TENSOR_TYPE* t) { \ - return TENSOR_TYPE##_nDimension(state, t); \ -} \ - \ -bool \ -TensorUtils::isContiguous(THCState* state, \ - TENSOR_TYPE* t) { \ - return TENSOR_TYPE##_isContiguous(state, t); \ -} \ - \ -bool \ -TensorUtils::allContiguous(THCState* state, \ - TENSOR_TYPE** inputs, \ - int numInputs) { \ - THAssert(numInputs > 0); \ - for (int i = 0; i < numInputs; ++i) { \ - if (!TensorUtils::isContiguous(state, inputs[i])) { \ - return false; \ - } \ - } \ - return true; \ -} \ - \ -/* Due to the resize semantics of ops with `out=` keywords, if */ \ -/* the output `tensor` has the same shape as the output of the */ \ -/* reduction operation, then any noncontiguities in the output */ \ -/* `tensor` 
should be preserved. This needs to be special cased b/c */ \ -/* otherwise, when keepdim=False, the implementations of reduction */ \ -/* ops resize `tensor` to the reduced size with keepdim=True, and */ \ -/* then later squeeze `tensor` to the correct output size, breaking */ \ -/* the contiguity guarantees of the resize semantics. */ \ -void \ -TensorUtils::preserveReduceDimSemantics( \ - THCState *state, TENSOR_TYPE *tensor, \ - int in_dims, int64_t dimension, int keepdim) {\ - int out_dims = TensorUtils::getDims(state, tensor); \ - if (out_dims > 0 && !keepdim && out_dims == in_dims - 1) { \ - TensorUtils::unsqueeze1d(state, tensor, tensor, dimension);\ - } \ -} \ - \ -int \ -TensorUtils::getDevice(THCState* state, \ - TENSOR_TYPE* t) { \ - return TENSOR_TYPE##_getDevice(state, t); \ -} \ - \ -bool \ -TensorUtils::allSameDevice(THCState* state, \ - TENSOR_TYPE** inputs, \ - int numInputs) { \ - THAssert(numInputs > 0); \ - int device = TensorUtils::getDevice(state, inputs[0]); \ - for (int i = 1; i < numInputs; ++i) { \ - if (TensorUtils::getDevice(state, inputs[i]) != device) { \ - return false; \ - } \ - } \ - return true; \ -} \ - \ void \ TensorUtils::copyIgnoringOverlaps(THCState* state, \ TENSOR_TYPE* dst, \ TENSOR_TYPE* src) { \ return TENSOR_TYPE##_copyIgnoringOverlaps(state, dst, src); \ } \ - \ -/* Returns false if there is no possibility that the tensor */ \ -/* has "overlapping" indices and true otherwise. */ \ -/* "Overlapping" indices are two+ valid indices that specify */ \ -/* the same offset within the tensor. */ \ -/* The function does this by checking for a sufficient but not */ \ -/* necessary condition of no overlap. In particular, that */ \ -/* that there exists an ordering of the tensor's dimensions */ \ -/* that is nicely "nested," with each dimension contained */ \ -/* within the next one. */ \ -bool \ -TensorUtils::maybeOverlappingIndices(THCState* state, \ - TENSOR_TYPE* t) { \ - /* Extract size/stride arrays; only consider size >1 dims. */ \ - SizeAndStride info[MAX_CUTORCH_DIMS]; \ - \ - int dims = TensorUtils::getDims(state, t); \ - int nonSize1Dims = 0; \ - for (int i = 0; i < dims; ++i) { \ - int64_t size = TensorUtils::getSize(state, t, i); \ - \ - if (size > 1) { \ - info[nonSize1Dims].size = size; \ - info[nonSize1Dims].stride = \ - TensorUtils::getStride(state, t, i); \ - \ - if (info[nonSize1Dims].stride < 1) { \ - return true; \ - } \ - \ - ++nonSize1Dims; \ - } \ - } \ - \ - /* Short-circuits if tensor is a single element. 
*/ \ - if (nonSize1Dims == 0) { \ - return false; \ - } \ - \ - /* Ascending order (innermost dimension in sorted view is at [0]) */ \ - qsort(info, nonSize1Dims, sizeof(SizeAndStride), compareSizeAndStride); \ - \ - for (int i = 0; i < (nonSize1Dims - 1); ++i) { \ - if (((info[i].size - 1) * info[i].stride) >= info[i + 1].stride) { \ - return true; \ - } \ - } \ - \ - return false; \ -} \ - \ -bool \ -TensorUtils::canUse32BitIndexMath(THCState* state, \ - TENSOR_TYPE* t, \ - ptrdiff_t max_elem) { \ - ptrdiff_t elements = TensorUtils::getNumElements(state, t); \ - if (elements >= max_elem) { \ - return false; \ - } \ - \ - ptrdiff_t offset = 0; \ - ptrdiff_t linearId = elements - 1; \ - \ - for (int i = TensorUtils::getDims(state, t) - 1; i >= 0; --i) { \ - ptrdiff_t curDimIndex = \ - linearId % TensorUtils::getSize(state, t, i); \ - ptrdiff_t curDimOffset = curDimIndex * \ - TensorUtils::getStride(state, t, i); \ - offset += curDimOffset; \ - linearId /= TensorUtils::getSize(state, t, i); \ - } \ - \ - if (offset >= max_elem) { \ - return false; \ - } \ - \ - return true; \ -} \ - \ -bool \ -TensorUtils::all32BitIndexable(THCState* state, \ - TENSOR_TYPE** inputs, \ - int numInputs) { \ - for (int i = 0; i < numInputs; ++i) { \ - if (!TensorUtils::canUse32BitIndexMath(state, inputs[i])) { \ - return false; \ - } \ - } \ - return true; \ -} IMPL_TENSOR_UTILS(THCudaByteTensor, uint8_t) IMPL_TENSOR_UTILS(THCudaCharTensor, int8_t) diff --git a/aten/src/THC/THCTensorTypeUtils.cuh b/aten/src/THC/THCTensorTypeUtils.cuh index 3ec5f367583c8d..f8f3d4f01cbfeb 100644 --- a/aten/src/THC/THCTensorTypeUtils.cuh +++ b/aten/src/THC/THCTensorTypeUtils.cuh @@ -5,8 +5,9 @@ #include #include "THCGeneral.h" #include "THCHalf.h" -#include "THCTensor.h" +#include "THCTensor.hpp" #include "THCTensorInfo.cuh" +#include "THCTensor.hpp" /// A utility for accessing THCuda*Tensor types in a generic manner @@ -39,42 +40,11 @@ struct TensorUtils { \ static TENSOR_TYPE* newTensor(THCState* state); \ static TENSOR_TYPE* newContiguous(THCState* state, TENSOR_TYPE* t); \ - static THLongStorage* newSizeOf(THCState* state, TENSOR_TYPE* t); \ - static void retain(THCState* state, TENSOR_TYPE* t); \ - static void free(THCState* state, TENSOR_TYPE* t); \ static void freeCopyTo(THCState* state, TENSOR_TYPE* src, \ TENSOR_TYPE* dst); \ - static void resize(THCState* state, TENSOR_TYPE* out, \ - THLongStorage* sizes, \ - THLongStorage* strides); \ - static void resizeAs(THCState* state, TENSOR_TYPE* dst, \ - TENSOR_TYPE* src); \ - static void squeeze1d(THCState *state, TENSOR_TYPE *dst, \ - TENSOR_TYPE *src, int dimension); \ - static void unsqueeze1d(THCState *state, TENSOR_TYPE *dst, \ - TENSOR_TYPE *src, int dimension); \ - static void preserveReduceDimSemantics( \ - THCState *state, TENSOR_TYPE *tensor, \ - int in_dims, int64_t dimension, int keepdim); \ static DATA_TYPE* getData(THCState* state, TENSOR_TYPE* t); \ - static ptrdiff_t getNumElements(THCState* state, TENSOR_TYPE* t); \ - static int64_t getSize(THCState* state, TENSOR_TYPE* t, int dim); \ - static int64_t getStride(THCState* state, TENSOR_TYPE* t, int dim); \ - static int getDims(THCState* state, TENSOR_TYPE* t); \ - static bool isContiguous(THCState* state, TENSOR_TYPE* t); \ - static bool allContiguous(THCState* state, TENSOR_TYPE** inputs, int numInputs); \ - static int getDevice(THCState* state, TENSOR_TYPE* t); \ - static bool allSameDevice(THCState* state, TENSOR_TYPE** inputs, int numInputs); \ static void copyIgnoringOverlaps(THCState* state, \ 
TENSOR_TYPE* dst, TENSOR_TYPE* src); \ - /* Returns false if there is no possibility that the tensor */ \ - /* has more than one index that references the same datapoint, */ \ - /* true otherwise. */ \ - static bool maybeOverlappingIndices(THCState* state, TENSOR_TYPE* t); \ - /* Can we use 32 bit math for indexing? */ \ - static bool canUse32BitIndexMath(THCState* state, TENSOR_TYPE* t, ptrdiff_t max_elem=INT32_MAX); \ - /* Are all tensors 32-bit indexable? */ \ - static bool all32BitIndexable(THCState* state, TENSOR_TYPE** inputs, int numInputs); \ } TENSOR_UTILS(THCudaByteTensor, uint8_t, int64_t); @@ -124,10 +94,10 @@ getTensorInfo(THCState* state, TensorType* t) { IndexType sz[MAX_CUTORCH_DIMS]; IndexType st[MAX_CUTORCH_DIMS]; - int dims = TensorUtils::getDims(state, t); + int dims = THCTensor_nDimension(state, t); for (int i = 0; i < dims; ++i) { - sz[i] = TensorUtils::getSize(state, t, i); - st[i] = TensorUtils::getStride(state, t, i); + sz[i] = THCTensor_size(state, t, i); + st[i] = THCTensor_stride(state, t, i); } return TensorInfo( @@ -155,7 +125,7 @@ struct ScalarNegate { return __float2half(-__half2float(v)); #endif #else -#if CUDA_VERSION < 9000 +#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) half out = v; #else __half_raw out = __half_raw(v); @@ -180,7 +150,7 @@ struct ScalarInv { }; inline bool operator==(half a, half b) { -#if CUDA_VERSION < 9000 +#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) return a.x == b.x; #else __half_raw araw, braw; @@ -191,7 +161,7 @@ inline bool operator==(half a, half b) { } inline bool operator!=(half a, half b) { -#if CUDA_VERSION < 9000 +#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) return a.x != b.x; #else __half_raw araw, braw; diff --git a/aten/src/THC/generic/THCStorage.cpp b/aten/src/THC/generic/THCStorage.cpp index fba23a2ad94922..5f7baabf71b37f 100644 --- a/aten/src/THC/generic/THCStorage.cpp +++ b/aten/src/THC/generic/THCStorage.cpp @@ -40,51 +40,20 @@ real THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index) THCStorage* THCStorage_(new)(THCState *state) { - return THCStorage_(newWithSize)(state, 0); + return THCStorage_new(state, at::CTypeToScalarType>::to()); } THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size) { - return THCStorage_(newWithAllocator)( - state, size, - state->cudaDeviceAllocator, - state->cudaDeviceAllocator->state); + return THCStorage_newWithSize(state, at::CTypeToScalarType>::to(), size); } THCStorage* THCStorage_(newWithAllocator)(THCState *state, ptrdiff_t size, THCDeviceAllocator* allocator, void* allocatorContext) { - THArgCheck(size >= 0, 2, "invalid size"); - int device; - THCudaCheck(cudaGetDevice(&device)); - - THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage)); - memset(storage, 0, sizeof(THCStorage)); - new (&storage->refcount) std::atomic(1); - storage->scalar_type = at::CTypeToScalarType>::to(); - storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM; - storage->allocator = allocator; - storage->allocatorContext = allocatorContext; - storage->size = size; - storage->device = device; - - if(size > 0) - { - // update heap *before* attempting malloc, to free space for the malloc - cudaError_t err = - (*allocator->malloc)(allocatorContext, - (void**)&(storage->data_ptr), - size * sizeof(real), - THCState_getCurrentStream(state)); - if(err != cudaSuccess){ - free(storage); - } - THCudaCheck(err); - } else { - storage->data_ptr = NULL; - } - return storage; + return 
THCStorage_newWithAllocator(state, at::CTypeToScalarType>::to(), + size, allocator, allocatorContext); } THCStorage* THCStorage_(newWithSize1)(THCState *state, real data0) @@ -170,8 +139,7 @@ void THCStorage_(clearFlag)(THCState *state, THCStorage *storage, const char fla void THCStorage_(retain)(THCState *state, THCStorage *self) { - if(self && (self->flag & TH_STORAGE_REFCOUNTED)) - self->refcount++; + THCStorage_retain(state, self); } int THCStorage_(retainIfLive)(THCState *state, THCStorage *storage) diff --git a/aten/src/THC/generic/THCStorage.cu b/aten/src/THC/generic/THCStorage.cu index 9676b297b09b6c..c3f25f4f037c03 100644 --- a/aten/src/THC/generic/THCStorage.cu +++ b/aten/src/THC/generic/THCStorage.cu @@ -15,72 +15,11 @@ void THCStorage_(fill)(THCState *state, THCStorage *self, real value) void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) { - THArgCheck(size >= 0, 2, "invalid size"); - THAssert(self->allocator != NULL); - int device; - THCudaCheck(cudaGetDevice(&device)); - - if(!(self->flag & TH_STORAGE_RESIZABLE)) - THError("Trying to resize storage that is not resizable"); - - if (self->allocator->realloc) { - real * data_ptr = self->data(); - cudaError_t err = (*self->allocator->realloc)( - self->allocatorContext, - (void**)&(data_ptr), - self->size * sizeof(real), - size * sizeof(real), THCState_getCurrentStream(state)); - if (err != cudaSuccess) { - THCudaCheck(err); - } - self->size = size; - self->device = device; - return; - } - - if(size == 0) - { - if(self->flag & TH_STORAGE_FREEMEM) { - THCudaCheck( - (*self->allocator->free)(self->allocatorContext, THCStorage_(data)(state, self))); - } - self->data_ptr = NULL; - self->size = 0; - self->device = device; - } - else - { - real *data = NULL; - cudaError_t err = - (*self->allocator->malloc)(self->allocatorContext, - (void**)&(data), - size * sizeof(real), - THCState_getCurrentStream(state)); - THCudaCheck(err); - - if (THCStorage_(data)(state, self)) { - // Enable p2p access when the memcpy is across devices - THCState_getPeerToPeerAccess(state, device, self->device); - - THCudaCheck(cudaMemcpyAsync(data, - THCStorage_(data)(state, self), - THMin(self->size, size) * sizeof(real), - cudaMemcpyDeviceToDevice, - THCState_getCurrentStream(state))); - if(self->flag & TH_STORAGE_FREEMEM) { - THCudaCheck( - (*self->allocator->free)(self->allocatorContext, THCStorage_(data)(state, self))); - } - } - - self->data_ptr = data; - self->size = size; - self->device = device; - } + THCStorage_resize(state, self, size); } THC_API int THCStorage_(getDevice)(THCState* state, const THCStorage* storage) { - return storage->device; + return THCStorage_getDevice(state, storage); } #endif diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp index 6867c8da57dd93..147c71491ed0de 100644 --- a/aten/src/THC/generic/THCTensor.cpp +++ b/aten/src/THC/generic/THCTensor.cpp @@ -15,26 +15,22 @@ ptrdiff_t THCTensor_(storageOffset)(THCState *state, const THCTensor *self) int THCTensor_(nDimension)(THCState *state, const THCTensor *self) { - return self->nDimension; + return THCTensor_nDimension(state, self); } int64_t THCTensor_(size)(THCState *state, const THCTensor *self, int dim) { - THArgCheck((dim >= 0) && (dim < self->nDimension), 2, "out of range"); - return self->size[dim]; + return THCTensor_size(state, self, dim); } int64_t THCTensor_(stride)(THCState *state, const THCTensor *self, int dim) { - THArgCheck((dim >= 0) && (dim < self->nDimension), 2, "out of range"); - return 
self->stride[dim]; + return THCTensor_stride(state, self, dim); } THLongStorage *THCTensor_(newSizeOf)(THCState *state, THCTensor *self) { - THLongStorage *size = THLongStorage_newWithSize(self->nDimension); - THLongStorage_rawCopy(size, self->size); - return size; + return THCTensor_newSizeOf(state, self); } THLongStorage *THCTensor_(newStrideOf)(THCState *state, THCTensor *self) @@ -300,32 +296,12 @@ THCTensor *THCTensor_(newFoldBatchDim)(THCState *state, THCTensor *input) { /* Resize */ void THCTensor_(resize)(THCState *state, THCTensor *self, THLongStorage *size, THLongStorage *stride) { - THArgCheck(size != NULL, 2, "invalid size"); - if(stride) - THArgCheck(stride->size == size->size, 3, "invalid stride"); - - THCTensor_(resizeNd)(state, self, size->size, THLongStorage_data(size), (stride ? THLongStorage_data(stride) : NULL)); + THCTensor_resize(state, self, size, stride); } void THCTensor_(resizeAs)(THCState *state, THCTensor *self, THCTensor *src) { - int isSame = 0; - int d; - if(self->nDimension == src->nDimension) - { - isSame = 1; - for(d = 0; d < self->nDimension; d++) - { - if(self->size[d] != src->size[d]) - { - isSame = 0; - break; - } - } - } - - if(!isSame) - THCTensor_(resizeNd)(state, self, src->nDimension, src->size, NULL); + THCTensor_resizeAs(state, self, src); } void THCTensor_(resize1d)(THCState *state, THCTensor *tensor, int64_t size0) @@ -359,14 +335,7 @@ void THCTensor_(resize5d)(THCState *state, THCTensor *self, int64_t size0, int64 void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src) { - if(self != src) - THCTensor_(setStorageNd)(state, - self, - src->storage, - src->storageOffset, - src->nDimension, - src->size, - src->stride); + THCTensor_set(state, self, src); } void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_) @@ -570,68 +539,17 @@ void THCTensor_(squeeze)(THCState *state, THCTensor *self, THCTensor *src) void THCTensor_(squeeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension) { - int d; - - if(!src) - src = self; - - THArgCheck(dimension < src->nDimension, 3, "dimension out of range"); - - THCTensor_(set)(state, self, src); - - if(src->size[dimension] == 1 && src->nDimension > 1) - { - for(d = dimension; d < self->nDimension-1; d++) - { - self->size[d] = self->size[d+1]; - self->stride[d] = self->stride[d+1]; - } - self->nDimension--; - } + THCTensor_squeeze1d(state, self, src, dimension); } void THCTensor_(unsqueeze1d)(THCState *state, THCTensor *self, THCTensor *src, int dimension) { - int d; - - if(!src) - src = self; - - THArgCheck((dimension >= 0) && (dimension <= src->nDimension), 3, "dimension out of range"); - THArgCheck(src->nDimension > 0, 3, "cannot unsqueeze empty tensor"); - - THCTensor_(set)(state, self, src); - - self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*(self->nDimension+1)); - self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*(self->nDimension+1)); - self->nDimension++; - for (d = self->nDimension-1; d > dimension; d--) { - self->size[d] = self->size[d-1]; - self->stride[d] = self->stride[d-1]; - } - if (dimension+1 < self->nDimension) { - self->stride[dimension] = self->size[dimension+1] * self->stride[dimension+1]; - } else { - self->stride[dimension] = 1; - } - self->size[dimension] = 1; + THCTensor_unsqueeze1d(state, self, src, dimension); } int THCTensor_(isContiguous)(THCState *state, const THCTensor *self) { - int64_t z = 1; - int d; - for(d = 
self->nDimension-1; d >= 0; d--) - { - if(self->size[d] != 1) - { - if(self->stride[d] == z) - z *= self->size[d]; - else - return 0; - } - } - return 1; + return THCTensor_isContiguous(state, self); } int THCTensor_(isSize)(THCState *state, const THCTensor *self, const THLongStorage *dims) @@ -680,41 +598,17 @@ int THCTensor_(isSameSizeAs)(THCState *state, const THCTensor *self, const THCTe ptrdiff_t THCTensor_(nElement)(THCState *state, const THCTensor *self) { - if(self->nDimension == 0) - return 0; - else - { - ptrdiff_t nElement = 1; - int d; - for(d = 0; d < self->nDimension; d++) - nElement *= self->size[d]; - return nElement; - } + return THCTensor_nElement(state, self); } void THCTensor_(retain)(THCState *state, THCTensor *self) { - if(self->flag & TH_TENSOR_REFCOUNTED) - self->refcount++; + THCTensor_retain(state, self); } void THCTensor_(free)(THCState *state, THCTensor *self) { - if(!self) - return; - - if(self->flag & TH_TENSOR_REFCOUNTED) - { - if(--self->refcount == 0) - { - THFree(self->size); - THFree(self->stride); - if(self->storage) - THCStorage_(free)(state, self->storage); - self->refcount.~atomic(); - THFree(self); - } - } + THCTensor_free(state, self); } void THCTensor_(freeCopyTo)(THCState *state, THCTensor *self, THCTensor *dst) @@ -740,95 +634,12 @@ static void THCTensor_(rawInit)(THCState *state, THCTensor *self) void THCTensor_(setStorageNd)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, int64_t *size, int64_t *stride) { - /* storage */ - if(self->storage != storage) - { - if(self->storage) - THCStorage_(free)(state, self->storage); - - if(storage) - { - self->storage = storage; - THCStorage_(retain)(state, self->storage); - } - else - self->storage = THCStorage_(new)(state); - } - - /* storageOffset */ - if(storageOffset < 0) - THError("Tensor: invalid storage offset"); - self->storageOffset = storageOffset; - - /* size and stride */ - THCTensor_(resizeNd)(state, self, nDimension, size, stride); + THCTensor_setStorageNd(state, self, storage, storageOffset, nDimension, size, stride); } void THCTensor_(resizeNd)(THCState *state, THCTensor *self, int nDimension, int64_t *size, int64_t *stride) { - int d; - int nDimension_; - ptrdiff_t totalSize; - int hascorrectsize = 1; - - nDimension_ = 0; - for(d = 0; d < nDimension; d++) - { - if(size[d] > 0) - { - nDimension_++; - if((self->nDimension > d) && (size[d] != self->size[d])) - hascorrectsize = 0; - - if((self->nDimension > d) && stride && (stride[d] >= 0) && (stride[d] != self->stride[d])) - hascorrectsize = 0; - } - else - break; - } - nDimension = nDimension_; - - if(nDimension != self->nDimension) - hascorrectsize = 0; - - if(hascorrectsize) - return; - - if(nDimension > 0) - { - if(nDimension != self->nDimension) - { - self->size = (int64_t*)THRealloc(self->size, sizeof(int64_t)*nDimension); - self->stride = (int64_t*)THRealloc(self->stride, sizeof(int64_t)*nDimension); - self->nDimension = nDimension; - } - - totalSize = 1; - for(d = self->nDimension-1; d >= 0; d--) - { - self->size[d] = size[d]; - if(stride && (stride[d] >= 0) ) - self->stride[d] = stride[d]; - else - { - if(d == self->nDimension-1) - self->stride[d] = 1; - else - self->stride[d] = self->size[d+1]*self->stride[d+1]; - } - totalSize += (self->size[d]-1)*self->stride[d]; - } - - if(totalSize+self->storageOffset > 0) - { - if(!self->storage) - self->storage = THCStorage_(new)(state); - if(totalSize+self->storageOffset > self->storage->size) - THCStorage_(resize)(state, self->storage, 
totalSize+self->storageOffset); - } - } - else - self->nDimension = 0; + THCTensor_resizeNd(state, self, nDimension, size, stride); } void THCTensor_(set1d)(THCState *state, THCTensor *tensor, int64_t x0, real value) diff --git a/aten/src/THC/generic/THCTensor.cu b/aten/src/THC/generic/THCTensor.cu index fbe0ed1d0d87d5..98478341575c75 100644 --- a/aten/src/THC/generic/THCTensor.cu +++ b/aten/src/THC/generic/THCTensor.cu @@ -3,8 +3,7 @@ #else THC_API int THCTensor_(getDevice)(THCState* state, const THCTensor* tensor) { - if (!tensor->storage) return -1; - return THCStorage_(getDevice)(state, tensor->storage); + return THCTensor_getDevice(state, tensor); } #endif diff --git a/aten/src/THC/generic/THCTensor.hpp b/aten/src/THC/generic/THCTensor.hpp index ebffb56f47c9f6..f3f28557b17a83 100644 --- a/aten/src/THC/generic/THCTensor.hpp +++ b/aten/src/THC/generic/THCTensor.hpp @@ -2,18 +2,9 @@ #define THC_GENERIC_FILE "generic/THCTensor.hpp" #else -typedef struct THCTensor -{ - int64_t *size; - int64_t *stride; - int nDimension; - - THCStorage *storage; - ptrdiff_t storageOffset; - std::atomic refcount; - - char flag; - +// This allows one to write generic functions (i.e. functions that work across all THRealTensor types) +// without having to explicitly cast the THRealTensor. +typedef struct THCTensor : _THCTensor { } THCTensor; #endif diff --git a/aten/src/THC/generic/THCTensorIndex.cu b/aten/src/THC/generic/THCTensorIndex.cu index 85276772c30a1d..5b40fab49f735c 100644 --- a/aten/src/THC/generic/THCTensorIndex.cu +++ b/aten/src/THC/generic/THCTensorIndex.cu @@ -141,9 +141,9 @@ void THCTensor_(indexCopy)(THCState *state, THCTensor *dst, int dim, THCudaLongT dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); dim3 largeIndexBlock(std::min(srcTotalSize, (ptrdiff_t)128)); - if (TensorUtils::canUse32BitIndexMath(state, dst) && - TensorUtils::canUse32BitIndexMath(state, src) && - TensorUtils::canUse32BitIndexMath(state, indices)) { + if (THCTensor_canUse32BitIndexMath(state, dst) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, indices)) { TensorInfo dstInfo = getTensorInfo(state, dst); int dstCopyDim = dstInfo.collapseDims(dim); @@ -330,9 +330,9 @@ void THCTensor_(indexAdd)(THCState *state, THCTensor *dst, int dim, THCudaLongTe dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); dim3 largeIndexBlock(std::min(srcTotalSize, (ptrdiff_t)128)); - if (TensorUtils::canUse32BitIndexMath(state, dst) && - TensorUtils::canUse32BitIndexMath(state, src) && - TensorUtils::canUse32BitIndexMath(state, indices)) { + if (THCTensor_canUse32BitIndexMath(state, dst) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, indices)) { TensorInfo dstInfo = getTensorInfo(state, dst); int dstAddDim = dstInfo.collapseDims(dim); @@ -447,8 +447,8 @@ void THCTensor_(indexFill)(THCState *state, THCTensor *dst, int dim, THCudaLongT dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); dim3 largeIndexBlock(std::min(dstTotalSize, (ptrdiff_t)128)); - if (TensorUtils::canUse32BitIndexMath(state, dst) && - TensorUtils::canUse32BitIndexMath(state, indices)) { + if (THCTensor_canUse32BitIndexMath(state, dst) && + THCTensor_canUse32BitIndexMath(state, indices)) { TensorInfo dstInfo = getTensorInfo(state, dst); int dstFillDim = dstInfo.collapseDims(dim); @@ -579,9 +579,9 @@ void THCTensor_(indexSelect)(THCState *state, THCTensor *dst, 
THCTensor *src, in dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); dim3 largeIndexBlock(std::min(dstTotalSize, (ptrdiff_t)128)); - if (TensorUtils::canUse32BitIndexMath(state, dst) && - TensorUtils::canUse32BitIndexMath(state, src) && - TensorUtils::canUse32BitIndexMath(state, indices)) { + if (THCTensor_canUse32BitIndexMath(state, dst) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, indices)) { TensorInfo dstInfo = getTensorInfo(state, dst); int dstSelectDim = dstInfo.collapseDims(dim); diff --git a/aten/src/THC/generic/THCTensorMath.cu b/aten/src/THC/generic/THCTensorMath.cu index c9e267204f8a0e..37dc86cc31e1db 100644 --- a/aten/src/THC/generic/THCTensorMath.cu +++ b/aten/src/THC/generic/THCTensorMath.cu @@ -192,10 +192,10 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result, if (numInputs > 1 && !hasEmptyInput && THCTensor_(nDimension)(state, result) <= CAT_ARRAY_MAX_INPUT_DIMS && - TensorUtils::canUse32BitIndexMath(state, result) && - TensorUtils::allContiguous(state, inputs, numInputs) && - TensorUtils::all32BitIndexable(state, inputs, numInputs) && - TensorUtils::allSameDevice(state, inputs, numInputs)) { + THCTensor_canUse32BitIndexMath(state, result) && + THCTensor_allContiguous(state, (const _THCTensor **)inputs, numInputs) && + THCTensor_all32BitIndexable(state, (const _THCTensor **)inputs, numInputs) && + THCTensor_allSameDevice(state, (const _THCTensor **)inputs, numInputs)) { // First, let's set up our kernel parameters. We start with a raw pointer to the storage // for the output Tensor. diff --git a/aten/src/THC/generic/THCTensorMathPairwise.cu b/aten/src/THC/generic/THCTensorMathPairwise.cu index 79fb0c1b496cdf..be3fc3e489dbd0 100644 --- a/aten/src/THC/generic/THCTensorMathPairwise.cu +++ b/aten/src/THC/generic/THCTensorMathPairwise.cu @@ -130,7 +130,7 @@ THC_API void THCTensor_(rshift)(THCState* state, THCTensor *self_, THCTensor *src_, real value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) - THCTensor_(mul)(state, self_, src_, pow(2, value)); + THCTensor_(mul)(state, self_, src_, pow(2, -value)); #elif defined(THC_REAL_IS_HALF) return THError("rshift not supported for torch.CudaHalfTensor"); #else diff --git a/aten/src/THC/generic/THCTensorMathReduce.cu b/aten/src/THC/generic/THCTensorMathReduce.cu index 4987ad9436abc7..e60e33c7b46821 100644 --- a/aten/src/THC/generic/THCTensorMathReduce.cu +++ b/aten/src/THC/generic/THCTensorMathReduce.cu @@ -90,7 +90,7 @@ THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, int dimension { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); - TensorUtils::preserveReduceDimSemantics( + THCTensor_preserveReduceDimSemantics( state, self_, THCTensor_(nDimension)(state, src), dimension, keepdim); THLongStorage *dim = THCTensor_(newSizeOf)(state, src); THLongStorage_set(dim, dimension, 1); @@ -119,7 +119,7 @@ THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, int dimension { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); - TensorUtils::preserveReduceDimSemantics( + THCTensor_preserveReduceDimSemantics( state, self_, THCTensor_(nDimension)(state, src), dimension, keepdim); THLongStorage *dim = THCTensor_(newSizeOf)(state, src); THLongStorage_set(dim, dimension, 1); diff --git a/aten/src/THC/generic/THCTensorMode.cu b/aten/src/THC/generic/THCTensorMode.cu index 5eb7b50b4d6c17..6349e07b178dfe 100644 --- a/aten/src/THC/generic/THCTensorMode.cu +++ 
b/aten/src/THC/generic/THCTensorMode.cu @@ -180,9 +180,9 @@ THC_API void THCTensor_(mode)(THCState *state, // Resize output value, index Tensors to appropriate sizes (i.e. the same as // the input Tensor, except at dim=dimension, the size is 1) - TensorUtils::preserveReduceDimSemantics( + THCTensor_preserveReduceDimSemantics( state, values, ndim, dimension, keepdim); - TensorUtils::preserveReduceDimSemantics( + THCTensor_preserveReduceDimSemantics( state, indices, ndim, dimension, keepdim); dim = THCTensor_(newSizeOf)(state, input); THLongStorage_set(dim, dimension, 1); @@ -209,7 +209,7 @@ THC_API void THCTensor_(mode)(THCState *state, // 3. Can use 32-bit index math for indexing (mainly just for implementation conciseness, could be changed) if (sliceSize <= MAX_BLOCK_SIZE && slices <= MAX_GRID_SIZE && - TensorUtils::canUse32BitIndexMath(state, input)) { + THCTensor_canUse32BitIndexMath(state, input)) { // Beginning our optimized implementation. First thing we want to do is to transpose // the input Tensor along the sort dimension, and then make it contiguous transposed = THCTensor_(newTranspose)(state, input, dimension, ndim - 1); diff --git a/aten/src/THC/generic/THCTensorScatterGather.cu b/aten/src/THC/generic/THCTensorScatterGather.cu index c026075a954cd5..58ad914c8910f2 100644 --- a/aten/src/THC/generic/THCTensorScatterGather.cu +++ b/aten/src/THC/generic/THCTensorScatterGather.cu @@ -4,7 +4,7 @@ #define RUN(TYPE, DIMS, REAL) \ THCudaTensor_gatherKernel \ - <<>>( \ + <<>>( \ tensorInfo, srcInfo, indexInfo, dim, (TYPE)totalElements); void THCTensor_(gather)(THCState* state, THCTensor *tensor, @@ -37,17 +37,19 @@ void THCTensor_(gather)(THCState* state, THCTensor *tensor, const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index); const dim3 block = getApplyBlock(); dim3 grid; - THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING); + int curDevice = -1; + cudaGetDevice(&curDevice); + THArgCheck(getApplyGrid(state, totalElements, grid, curDevice), 1, CUTORCH_DIM_WARNING); THCTensor* oldTensor = NULL; - if (TensorUtils::maybeOverlappingIndices(state, tensor)) { + if (THCTensor_maybeOverlappingIndices(state, tensor)) { oldTensor = tensor; tensor = THCTensor_(newContiguous)(state, tensor); } - if (TensorUtils::canUse32BitIndexMath(state, tensor) && - TensorUtils::canUse32BitIndexMath(state, src) && - TensorUtils::canUse32BitIndexMath(state, index)) { + if (THCTensor_canUse32BitIndexMath(state, tensor) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, index)) { TensorInfo tensorInfo = getTensorInfo(state, tensor); TensorInfo srcInfo = @@ -98,7 +100,7 @@ void THCTensor_(gather)(THCState* state, THCTensor *tensor, #define RUN(TYPE, DIMS, REAL) \ THCudaTensor_scatterKernel \ - <<>>( \ + <<>>( \ tensorInfo, srcInfo, indexInfo, dim, (TYPE)totalElements); void THCTensor_(scatter)(THCState* state, THCTensor *tensor, int dim, THCudaLongTensor *index, THCTensor *src) { @@ -130,17 +132,19 @@ void THCTensor_(scatter)(THCState* state, THCTensor *tensor, int dim, THCudaLong const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index); const dim3 block = getApplyBlock(); dim3 grid; - THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING); + int curDevice = -1; + cudaGetDevice(&curDevice); + THArgCheck(getApplyGrid(state, totalElements, grid, curDevice), 1, CUTORCH_DIM_WARNING); THCTensor* oldTensor = NULL; - if (TensorUtils::maybeOverlappingIndices(state, tensor)) { + if 
(THCTensor_maybeOverlappingIndices(state, tensor)) { oldTensor = tensor; tensor = THCTensor_(newContiguous)(state, tensor); } - if (TensorUtils::canUse32BitIndexMath(state, tensor) && - TensorUtils::canUse32BitIndexMath(state, src) && - TensorUtils::canUse32BitIndexMath(state, index)) { + if (THCTensor_canUse32BitIndexMath(state, tensor) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, index)) { TensorInfo tensorInfo = getTensorInfo(state, tensor); TensorInfo srcInfo = @@ -186,7 +190,7 @@ void THCTensor_(scatter)(THCState* state, THCTensor *tensor, int dim, THCudaLong #define RUN(TYPE, DIMS, REAL) \ THCudaTensor_scatterAddKernel \ - <<>>( \ + <<>>( \ tensorInfo, srcInfo, indexInfo, dim, (TYPE)totalElements); void THCTensor_(scatterAdd)(THCState* state, THCTensor *tensor, int dim, THCudaLongTensor *index, THCTensor *src) { @@ -217,17 +221,19 @@ void THCTensor_(scatterAdd)(THCState* state, THCTensor *tensor, int dim, THCudaL const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index); const dim3 block = getApplyBlock(); dim3 grid; - THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING); + int curDevice = -1; + cudaGetDevice(&curDevice); + THArgCheck(getApplyGrid(state, totalElements, grid, curDevice), 1, CUTORCH_DIM_WARNING); THCTensor* oldTensor = NULL; - if (TensorUtils::maybeOverlappingIndices(state, tensor)) { + if (THCTensor_maybeOverlappingIndices(state, tensor)) { oldTensor = tensor; tensor = THCTensor_(newContiguous)(state, tensor); } - if (TensorUtils::canUse32BitIndexMath(state, tensor) && - TensorUtils::canUse32BitIndexMath(state, src) && - TensorUtils::canUse32BitIndexMath(state, index)) { + if (THCTensor_canUse32BitIndexMath(state, tensor) && + THCTensor_canUse32BitIndexMath(state, src) && + THCTensor_canUse32BitIndexMath(state, index)) { TensorInfo tensorInfo = getTensorInfo(state, tensor); TensorInfo srcInfo = @@ -273,7 +279,7 @@ void THCTensor_(scatterAdd)(THCState* state, THCTensor *tensor, int dim, THCudaL #define RUN(TYPE, DIMS, REAL) \ THCudaTensor_scatterFillKernel \ - <<>>( \ + <<>>( \ tensorInfo, indexInfo, value, dim, (TYPE)totalElements); void @@ -302,16 +308,18 @@ THCTensor_(scatterFill)(THCState* state, THCTensor *tensor, const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index); const dim3 block = getApplyBlock(); dim3 grid; - THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING); + int curDevice = -1; + cudaGetDevice(&curDevice); + THArgCheck(getApplyGrid(state, totalElements, grid, curDevice), 1, CUTORCH_DIM_WARNING); THCTensor* oldTensor = NULL; - if (TensorUtils::maybeOverlappingIndices(state, tensor)) { + if (THCTensor_maybeOverlappingIndices(state, tensor)) { oldTensor = tensor; tensor = THCTensor_(newContiguous)(state, tensor); } - if (TensorUtils::canUse32BitIndexMath(state, tensor) && - TensorUtils::canUse32BitIndexMath(state, index)) { + if (THCTensor_canUse32BitIndexMath(state, tensor) && + THCTensor_canUse32BitIndexMath(state, index)) { TensorInfo tensorInfo = getTensorInfo(state, tensor); TensorInfo indexInfo = diff --git a/aten/src/THC/generic/THCTensorSort.cu b/aten/src/THC/generic/THCTensorSort.cu index 8d4e4720eebfe2..47dde9d7f5d41b 100644 --- a/aten/src/THC/generic/THCTensorSort.cu +++ b/aten/src/THC/generic/THCTensorSort.cu @@ -109,7 +109,7 @@ THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, // The constructed key/value tensor info is used to select the slice // we are sorting on a per-block basis - if 
(TensorUtils::canUse32BitIndexMath(state, key)) { + if (THCTensor_canUse32BitIndexMath(state, key)) { TensorInfo keyInfo = getTensorInfo(state, key); keyInfo.reduceDim(dim); @@ -153,11 +153,11 @@ THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, THCudaCheck(cudaGetLastError()); } -void sortViaThrust(THCState* state, - THCTensor* sorted, - THCudaLongTensor* indices, - THCTensor* input, - int dim, bool dir) { +void THCTensor_(sortViaThrust)(THCState* state, + THCTensor* sorted, + THCudaLongTensor* indices, + THCTensor* input, + int dim, bool dir) { int nDims = THCTensor_(nDimension)(state, input); ptrdiff_t totalElements = THCTensor_(nElement)(state, input); @@ -327,7 +327,7 @@ THC_API void THCTensor_(sort)(THCState* state, } else { // Otherwise, fall back upon Thrust, which handles all other cases // (potentially slowly, with extra copies/memory allocations) - sortViaThrust(state, sorted, indices, input, dim, (bool) order); + THCTensor_(sortViaThrust)(state, sorted, indices, input, dim, (bool) order); } THCudaCheck(cudaGetLastError()); diff --git a/aten/src/THC/generic/THCTensorTopK.cu b/aten/src/THC/generic/THCTensorTopK.cu index f10dac01f451db..1d280df04ae1bb 100644 --- a/aten/src/THC/generic/THCTensorTopK.cu +++ b/aten/src/THC/generic/THCTensorTopK.cu @@ -110,9 +110,9 @@ THC_API void THCTensor_(topk)(THCState* state, // Based on required index size, run the algorithm with the // appropriate index type - if (TensorUtils::canUse32BitIndexMath(state, input) && - TensorUtils::canUse32BitIndexMath(state, topK) && - TensorUtils::canUse32BitIndexMath(state, indices)) { + if (THCTensor_canUse32BitIndexMath(state, input) && + THCTensor_canUse32BitIndexMath(state, topK) && + THCTensor_canUse32BitIndexMath(state, indices)) { RUN_T(uint32_t); } else { RUN_T(uint64_t); diff --git a/aten/src/THCS/generic/THCSTensorMath.cu b/aten/src/THCS/generic/THCSTensorMath.cu index 02fbe9e09a365d..ac77b5f198de3e 100644 --- a/aten/src/THCS/generic/THCSTensorMath.cu +++ b/aten/src/THCS/generic/THCSTensorMath.cu @@ -260,19 +260,22 @@ void THCSTensor_(spcadd)(THCState *state, THCTensor *r_, THCTensor *dense, real // TODO benchmark to decide whether to remove this special case const dim3 block = getApplyBlock(); dim3 grid; + int curDevice = -1; + cudaGetDevice(&curDevice); if (sparse->nDimensionV == 0) { - THArgCheck(getApplyGrid(state, nnz, grid), 1, CUTORCH_DIM_WARNING); + + THArgCheck(getApplyGrid(state, nnz, grid, curDevice), 1, CUTORCH_DIM_WARNING); THCSTensor_sparseElementwiseKernelScalar, uint64_t, real> - <<>>( + <<>>( TensorCAddOp(value), V_INFO(r_), I_INFO(indices), V_INFO(values), (uint64_t) nnz); } else { - THArgCheck(getApplyGrid(state, nnz * block.x, grid), 1, CUTORCH_DIM_WARNING); + THArgCheck(getApplyGrid(state, nnz * block.x, grid, curDevice), 1, CUTORCH_DIM_WARNING); THCSTensor_sparseElementwiseKernel, uint64_t, real> - <<>>( + <<>>( TensorCAddOp(value), V_INFO(r_), I_INFO(indices), V_INFO(values), (uint64_t) nnz); @@ -474,10 +477,12 @@ void THCSTensor_(cmul)(THCState *state, THCSTensor *r_, THCSTensor *t_, THCSTens int64_t valueSize = t_values_->stride[0]; const dim3 block = dim3(min((int64_t) getApplyBlock().x, valueSize)); dim3 grid; - THArgCheck(getApplyGrid(state, valueSize, grid), 1, CUTORCH_DIM_WARNING); + int curDevice = -1; + cudaGetDevice(&curDevice); + THArgCheck(getApplyGrid(state, valueSize, grid, curDevice), 1, CUTORCH_DIM_WARNING); THCSTensor_valueSparseIntersectionKernel, uint64_t, real> - <<>>( + <<>>( TensorMulOp(), I_INFO(r_indices_), I_INFO(t_indices_), 
I_INFO(s_indices_), V_INFO(r_values_), V_INFO(t_values_), V_INFO(s_values_), @@ -486,7 +491,7 @@ void THCSTensor_(cmul)(THCState *state, THCSTensor *r_, THCSTensor *t_, THCSTens THCudaLongStorage *resultNnz = THCudaLongStorage_newWithSize(state, 1); THCSTensor_indexSparseIntersectionKernel - <<<1, 1, 0, THCState_getCurrentStream(state)>>>( + <<<1, 1, 0, THCState_getCurrentStreamOnDevice(state, curDevice)>>>( I_INFO(r_indices_), I_INFO(t_indices_), I_INFO(s_indices_), (uint64_t)t_nnz, (uint64_t)s_nnz, (uint64_t*)THCudaLongStorage_data(state, resultNnz)); THCudaCheck(cudaGetLastError()); diff --git a/aten/src/THCUNN/AbsCriterion.cu b/aten/src/THCUNN/AbsCriterion.cu index b0874d78509d12..cb0f47510bc559 100644 --- a/aten/src/THCUNN/AbsCriterion.cu +++ b/aten/src/THCUNN/AbsCriterion.cu @@ -2,6 +2,7 @@ #include "common.h" #include "THCHalf.h" #include "THCHalfAutoNumerics.cuh" +#include "THCApply.cuh" #include #include diff --git a/aten/src/THCUNN/BCECriterion.cu b/aten/src/THCUNN/BCECriterion.cu index 066f562e4a3768..dfef9082de9498 100644 --- a/aten/src/THCUNN/BCECriterion.cu +++ b/aten/src/THCUNN/BCECriterion.cu @@ -2,12 +2,15 @@ #include "common.h" #include "THCHalf.h" #include "THCHalfAutoNumerics.cuh" +#include "THCThrustAllocator.cuh" +#include "THCApply.cuh" #include #include #include #include #include +#include template inline __host__ __device__ T eps(); diff --git a/aten/src/THCUNN/DistKLDivCriterion.cu b/aten/src/THCUNN/DistKLDivCriterion.cu index 3c0b0bc86c5092..e4e85b71045f8e 100644 --- a/aten/src/THCUNN/DistKLDivCriterion.cu +++ b/aten/src/THCUNN/DistKLDivCriterion.cu @@ -2,6 +2,7 @@ #include "common.h" #include "THCHalf.h" #include "THCHalfAutoNumerics.cuh" +#include "THCApply.cuh" #include #include diff --git a/aten/src/THCUNN/FusedRNNKernel.cu b/aten/src/THCUNN/FusedRNNKernel.cu index 58e22f8cfb08a0..d8b594ab046d04 100644 --- a/aten/src/THCUNN/FusedRNNKernel.cu +++ b/aten/src/THCUNN/FusedRNNKernel.cu @@ -23,7 +23,7 @@ struct TensorSigmoidOp { __device__ __forceinline__ void operator()(half* out, half* in) const { #ifdef CUDA_HALF_INSTRUCTIONS half one = ScalarConvert::to(1); - *out = hdiv(one, __hadd(one, hexp(__hneg(*in)))); + *out = __hdiv(one, __hadd(one, hexp(__hneg(*in)))); #else float fin = ScalarConvert::to(*in); *out = ScalarConvert::to(1.0f / (1.0f + expf(- fin))); @@ -33,7 +33,7 @@ struct TensorSigmoidOp { __device__ __forceinline__ void operator()(half* v) const { #ifdef CUDA_HALF_INSTRUCTIONS half one = ScalarConvert::to(1); - *v = hdiv(one, __hadd(one, hexp(__hneg(*v)))); + *v = __hdiv(one, __hadd(one, hexp(__hneg(*v)))); #else float fv = ScalarConvert::to(*v); *v = ScalarConvert::to(1.0f / (1.0f + expf(- fv))); diff --git a/aten/src/THCUNN/MSECriterion.cu b/aten/src/THCUNN/MSECriterion.cu index dd26e88bff5aef..e9571fe06c4e30 100644 --- a/aten/src/THCUNN/MSECriterion.cu +++ b/aten/src/THCUNN/MSECriterion.cu @@ -3,6 +3,7 @@ #include "THCHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCThrustAllocator.cuh" +#include "THCApply.cuh" #include #include diff --git a/aten/src/THCUNN/SmoothL1Criterion.cu b/aten/src/THCUNN/SmoothL1Criterion.cu index 8e35599ba1dadf..c8018d997365dc 100644 --- a/aten/src/THCUNN/SmoothL1Criterion.cu +++ b/aten/src/THCUNN/SmoothL1Criterion.cu @@ -3,6 +3,7 @@ #include "THCHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCThrustAllocator.cuh" +#include "THCApply.cuh" #include #include diff --git a/aten/src/THCUNN/SoftMarginCriterion.cu b/aten/src/THCUNN/SoftMarginCriterion.cu index e38f80c0a77204..ee53e76dca2625 100644 --- 
a/aten/src/THCUNN/SoftMarginCriterion.cu +++ b/aten/src/THCUNN/SoftMarginCriterion.cu @@ -2,6 +2,7 @@ #include "common.h" #include "THCHalf.h" #include "THCHalfAutoNumerics.cuh" +#include "THCApply.cuh" #include #include diff --git a/aten/src/THCUNN/generic/AbsCriterion.cu b/aten/src/THCUNN/generic/AbsCriterion.cu index ba6bb55694d730..ab8904e0ca0e9c 100644 --- a/aten/src/THCUNN/generic/AbsCriterion.cu +++ b/aten/src/THCUNN/generic/AbsCriterion.cu @@ -2,8 +2,6 @@ #define THC_GENERIC_FILE "generic/AbsCriterion.cu" #else -#include "THCApply.cuh" - void THNN_(AbsCriterion_updateOutput)( THCState *state, THCTensor *input, diff --git a/aten/src/THCUNN/generic/BCECriterion.cu b/aten/src/THCUNN/generic/BCECriterion.cu index 777700498cfdee..0fe9efdd21b64f 100644 --- a/aten/src/THCUNN/generic/BCECriterion.cu +++ b/aten/src/THCUNN/generic/BCECriterion.cu @@ -2,8 +2,6 @@ #define THC_GENERIC_FILE "generic/BCECriterion.cu" #else -#include "THCApply.cuh" - void THNN_(BCECriterion_updateOutput)( THCState *state, THCTensor *input, @@ -32,7 +30,7 @@ void THNN_(BCECriterion_updateOutput)( input = THCTensor_(newContiguous)(state, input); target = THCTensor_(newContiguous)(state, target); - + THCThrustAllocator thrustAlloc(state); thrust::device_ptr input_data(THCTensor_(data)(state, input)); thrust::device_ptr target_data(THCTensor_(data)(state, target)); @@ -41,6 +39,7 @@ void THNN_(BCECriterion_updateOutput)( weights = THCTensor_(newContiguous)(state, weights); thrust::device_ptr weights_data(THCTensor_(data)(state, weights)); sum = thrust::transform_reduce( + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)), thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)), bce_functor_weights(), @@ -50,6 +49,7 @@ void THNN_(BCECriterion_updateOutput)( THCTensor_(free)(state, weights); } else { sum = thrust::transform_reduce( + thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)), thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)), bce_functor(), diff --git a/aten/src/THCUNN/generic/BatchNormalization.cu b/aten/src/THCUNN/generic/BatchNormalization.cu index eb2dc8405e9b39..eb3535898ae02c 100644 --- a/aten/src/THCUNN/generic/BatchNormalization.cu +++ b/aten/src/THCUNN/generic/BatchNormalization.cu @@ -6,18 +6,18 @@ #define DeviceTensor1 THCDeviceTensor template -static THCDeviceTensor devicetensor(THCState *state, THCTensor *t) { +static THCDeviceTensor THNN_(devicetensor)(THCState *state, THCTensor *t) { if (!t) { return THCDeviceTensor(); } - int inDim = THCTensor_(nDimension)(state, t); + int inDim = THCTensor_nDimension(state, t); if (inDim == Dim) { return toDeviceTensor(state, t); } // View in which the last dimensions are collapsed or expanded as needed - THAssert(THCTensor_(isContiguous)(state, t)); + THAssert(THCTensor_isContiguous(state, t)); int size[Dim]; for (int i = 0; i < Dim || i < inDim; ++i) { if (i < Dim && i < inDim) { @@ -43,14 +43,14 @@ void THNN_(BatchNormalization_updateOutput)( THCTensor_(resize1d)(state, saveMean_, nInput); THCTensor_(resize1d)(state, saveStd_, nInput); } - DeviceTensor3 input = devicetensor<3>(state, input_); - DeviceTensor3 output = devicetensor<3>(state, output_); - DeviceTensor1 weight = devicetensor<1>(state, weight_); - DeviceTensor1 bias = devicetensor<1>(state, bias_); - DeviceTensor1 runningMean = 
devicetensor<1>(state, runningMean_); - DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_); - DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_); - DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_); + DeviceTensor3 input = THNN_(devicetensor)<3>(state, input_); + DeviceTensor3 output = THNN_(devicetensor)<3>(state, output_); + DeviceTensor1 weight = THNN_(devicetensor)<1>(state, weight_); + DeviceTensor1 bias = THNN_(devicetensor)<1>(state, bias_); + DeviceTensor1 runningMean = THNN_(devicetensor)<1>(state, runningMean_); + DeviceTensor1 runningVar = THNN_(devicetensor)<1>(state, runningVar_); + DeviceTensor1 saveMean = THNN_(devicetensor)<1>(state, saveMean_); + DeviceTensor1 saveStd = THNN_(devicetensor)<1>(state, saveStd_); cudaStream_t s = THCState_getCurrentStream(state); cudaDeviceProp *prop = THCState_getCurrentDeviceProperties(state); @@ -81,16 +81,16 @@ void THNN_(BatchNormalization_backward)( THCTensor_(resizeAs)(state, gradInput_, input_); } - DeviceTensor3 input = devicetensor<3>(state, input_); - DeviceTensor3 gradOutput = devicetensor<3>(state, gradOutput_); - DeviceTensor3 gradInput = devicetensor<3>(state, gradInput_); - DeviceTensor1 gradWeight = devicetensor<1>(state, gradWeight_); - DeviceTensor1 gradBias = devicetensor<1>(state, gradBias_); - DeviceTensor1 weight = devicetensor<1>(state, weight_); - DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_); - DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_); - DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_); - DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_); + DeviceTensor3 input = THNN_(devicetensor)<3>(state, input_); + DeviceTensor3 gradOutput = THNN_(devicetensor)<3>(state, gradOutput_); + DeviceTensor3 gradInput = THNN_(devicetensor)<3>(state, gradInput_); + DeviceTensor1 gradWeight = THNN_(devicetensor)<1>(state, gradWeight_); + DeviceTensor1 gradBias = THNN_(devicetensor)<1>(state, gradBias_); + DeviceTensor1 weight = THNN_(devicetensor)<1>(state, weight_); + DeviceTensor1 runningMean = THNN_(devicetensor)<1>(state, runningMean_); + DeviceTensor1 runningVar = THNN_(devicetensor)<1>(state, runningVar_); + DeviceTensor1 saveMean = THNN_(devicetensor)<1>(state, saveMean_); + DeviceTensor1 saveStd = THNN_(devicetensor)<1>(state, saveStd_); cudaStream_t s = THCState_getCurrentStream(state); diff --git a/aten/src/THCUNN/generic/DistKLDivCriterion.cu b/aten/src/THCUNN/generic/DistKLDivCriterion.cu index 6ea5d3944a7c4d..b6a282527900ef 100644 --- a/aten/src/THCUNN/generic/DistKLDivCriterion.cu +++ b/aten/src/THCUNN/generic/DistKLDivCriterion.cu @@ -2,8 +2,6 @@ #define THC_GENERIC_FILE "generic/DistKLDivCriterion.cu" #else -#include "THCApply.cuh" - void THNN_(DistKLDivCriterion_updateOutput)( THCState *state, THCTensor *input, diff --git a/aten/src/THCUNN/generic/FeatureLPPooling.cu b/aten/src/THCUNN/generic/FeatureLPPooling.cu index 31cf82582b3e17..06a9d00dff5b4c 100644 --- a/aten/src/THCUNN/generic/FeatureLPPooling.cu +++ b/aten/src/THCUNN/generic/FeatureLPPooling.cu @@ -159,7 +159,7 @@ void THNN_(FeatureLPPooling_updateOutput)(THCState* state, "input must be 1-3 dimensions for non-batch mode"); } - THArgCheck(TensorUtils::canUse32BitIndexMath(state, inputTH), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, inputTH), 2, "input tensor must fit into 32-bit index math"); THCDeviceTensor input; @@ -201,9 +201,9 @@ void THNN_(FeatureLPPooling_updateGradInput)(THCState* state, int width, int stride, bool batchMode) { - 
THArgCheck(TensorUtils::canUse32BitIndexMath(state, gradOutputTH), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutputTH), 2, "output gradient tensor must fit into 32-bit index math"); - THArgCheck(TensorUtils::canUse32BitIndexMath(state, inputTH), 3, + THArgCheck(THCTensor_canUse32BitIndexMath(state, inputTH), 3, "input tensor must fit into 32-bit index math"); THCUNN_assertSameGPU(state, 4, gradOutputTH, inputTH, outputTH, gradInputTH); diff --git a/aten/src/THCUNN/generic/FusedRNNKernel.cu b/aten/src/THCUNN/generic/FusedRNNKernel.cu index b96ac829ea6769..e7b4fec0d557c7 100644 --- a/aten/src/THCUNN/generic/FusedRNNKernel.cu +++ b/aten/src/THCUNN/generic/FusedRNNKernel.cu @@ -18,10 +18,10 @@ void THNN_(FusedRNNAssertSizes)(THCState *state, int factor, int count, ...) THCTensor_(nElement)(state, hidden), 3, "Input and Hidden tensor sizes should be the same."); - THAssertMsg(TensorUtils::getDims(state, input) <= MAX_CUTORCH_DIMS, + THAssertMsg(THCTensor_nDimension(state, input) <= MAX_CUTORCH_DIMS, "Tensor dimension is too large."); - THAssertMsg(TensorUtils::getDims(state, hidden) <= MAX_CUTORCH_DIMS, + THAssertMsg(THCTensor_nDimension(state, hidden) <= MAX_CUTORCH_DIMS, "Tensor dimension is too large."); for (int arg=2; arg < count; ++arg){ @@ -29,7 +29,7 @@ void THNN_(FusedRNNAssertSizes)(THCState *state, int factor, int count, ...) THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, tens)*factor, 3, "A pointwise tensor was not the right size, should have 1/%u the elements of input/hidden tensor.", arg, factor); - THAssertMsg(TensorUtils::getDims(state, tens) <= MAX_CUTORCH_DIMS, + THAssertMsg(THCTensor_nDimension(state, tens) <= MAX_CUTORCH_DIMS, "Tensor dimension is too large."); } @@ -42,13 +42,13 @@ int THNN_(minIndexType)(THCState *state, int count, ...) va_start(list, count); THCTensor* tens = va_arg(list, THCTensor*); - int startDim = TensorUtils::getDims(state, tens); + int startDim = THCTensor_nDimension(state, tens); bool canCollapse = THCTensor_(isContiguous)(state,tens); for (int arg=1; arg < count; ++arg){ tens = va_arg(list, THCTensor*); canCollapse = canCollapse && THCTensor_(isContiguous)(state, tens); - if(TensorUtils::getDims(state, tens) != startDim){ + if(THCTensor_nDimension(state, tens) != startDim){ va_end(list); return -1; } @@ -65,7 +65,7 @@ bool THNN_(canUse32BitIndexMath)(THCState *state, int count, ...) 
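The renames running through these THC/THCUNN hunks replace TensorUtils::canUse32BitIndexMath with the non-generic THCTensor_canUse32BitIndexMath; the check decides whether a kernel may do its offset arithmetic with 32-bit indices instead of 64-bit ones. A minimal standalone sketch of the idea behind such a check, in C++ (hypothetical helper name and simplified logic, not the actual THC implementation):

#include <cstddef>
#include <cstdint>

// Returns true when the farthest element offset reachable from these
// sizes/strides still fits in a 32-bit index, so a kernel may index with
// uint32_t instead of uint64_t.
bool canUse32BitIndexMathSketch(const int64_t* size,
                                const int64_t* stride,
                                int nDim) {
  std::ptrdiff_t maxOffset = 0;
  for (int d = 0; d < nDim; ++d) {
    if (size[d] == 0) {
      return true;  // empty tensor: no element is ever addressed
    }
    maxOffset += (size[d] - 1) * stride[d];  // farthest element along dim d
  }
  return maxOffset <= INT32_MAX;
}

Kernels templated on the index type can then be instantiated for the cheap 32-bit case and fall back to 64-bit otherwise, which is the RUN_T(uint32_t) / RUN_T(uint64_t) split visible in the THCTensorTopK.cu hunk earlier in this diff.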
for (int arg=0; arg < count; ++arg){ THCTensor *tens = va_arg(list, THCTensor*); - if (!TensorUtils::canUse32BitIndexMath(state, tens)){ + if (!THCTensor_canUse32BitIndexMath(state, tens)){ va_end(list); return false; } @@ -385,26 +385,26 @@ __global__ void #define LSTM_FORWARD(ITYPE, DIM) THNN_(LSTMForward) \ \ - <<>> \ + <<>> \ (inputI, hiddenI, \ bias1I, bias2I, cxI, hyI, cyI, \ hid_size, totalElements); #define LSTM_BACKWARD(ITYPE, DIM) THNN_(LSTMBackward) \ \ - <<>> \ + <<>> \ (storageI, gradingatesI, cxI, cyI, \ gradoutI, gradoutcI, gradincxI, \ hid_size, totalElements); #define GRU_FORWARD(ITYPE, DIM) THNN_(GRUForward) \ - <<>> \ + <<>> \ (inputI, hiddenI, bias1I, bias2I, hxI, hyI, storageI, \ hid_size, totalElements); #define GRU_BACKWARD(ITYPE, DIM) THNN_(GRUBackward) \ \ - <<>> \ + <<>> \ (gradininputI, gradinhiddenI, gradoutI, gradinhxI, storageI, \ hid_size, totalElements); @@ -434,11 +434,13 @@ void THNN_(LSTM_forw_ind_wrap)( (state, 5, input, hidden, hy, cy, cx); } - ptrdiff_t totalElements = TensorUtils::getNumElements(state, cx); + ptrdiff_t totalElements = THCTensor_nElement(state, cx); const dim3 block = getApplyBlock(); dim3 grid; - THAssertMsg(getApplyGrid(state, totalElements, grid), + int curDevice = -1; + cudaGetDevice(&curDevice); + THAssertMsg(getApplyGrid(state, totalElements, grid, curDevice), "Could not get grid size for pointwise apply."); TINFO inputI = getTensorInfo(state, input); @@ -527,11 +529,13 @@ void THNN_(LSTM_back_ind_wrap)( int maxDim = THNN_(minIndexType) (state, 7, storage, gradInGates, cx, cy, gradOutput, gradOutputCell, gradInputCx); - ptrdiff_t totalElements = TensorUtils::getNumElements(state, gradOutput); + ptrdiff_t totalElements = THCTensor_nElement(state, gradOutput); const dim3 block = getApplyBlock(); dim3 grid; - THAssertMsg(getApplyGrid(state, totalElements, grid), + int curDevice = -1; + cudaGetDevice(&curDevice); + THAssertMsg(getApplyGrid(state, totalElements, grid, curDevice), "Could not get grid size for pointwise apply"); TINFO storageI = getTensorInfo(state, storage); @@ -616,11 +620,13 @@ void THNN_(GRU_forw_ind_wrap)( (state, 5, input, hidden, hx, hy, storage); } - ptrdiff_t totalElements = TensorUtils::getNumElements(state, hx); + ptrdiff_t totalElements = THCTensor_nElement(state, hx); const dim3 block = getApplyBlock(); dim3 grid; - THAssertMsg(getApplyGrid(state, totalElements, grid), + int curDevice = -1; + cudaGetDevice(&curDevice); + THAssertMsg(getApplyGrid(state, totalElements, grid, curDevice), "Could not get grid size for pointwise apply."); TINFO inputI = getTensorInfo(state, input); @@ -713,11 +719,13 @@ void THNN_(GRU_back_ind_wrap)( int maxDim = THNN_(minIndexType)(state, 5, gradInInput, gradInHidden, gradOutput, gradInputHx, storage); - ptrdiff_t totalElements = TensorUtils::getNumElements(state, gradOutput); + ptrdiff_t totalElements = THCTensor_nElement(state, gradOutput); const dim3 block = getApplyBlock(); dim3 grid; - THAssertMsg(getApplyGrid(state, totalElements, grid), + int curDevice = -1; + cudaGetDevice(&curDevice); + THAssertMsg(getApplyGrid(state, totalElements, grid, curDevice), "Could not get grid size for pointwise apply"); TINFO gradininputI = getTensorInfo(state, gradInInput); diff --git a/aten/src/THCUNN/generic/IndexLinear.cu b/aten/src/THCUNN/generic/IndexLinear.cu index ec91d3f5f20881..f65c21dfe199e9 100644 --- a/aten/src/THCUNN/generic/IndexLinear.cu +++ b/aten/src/THCUNN/generic/IndexLinear.cu @@ -2,7 +2,7 @@ #define THC_GENERIC_FILE "generic/IndexLinear.cu" #else -static bool 
THCUNN_checkKeysValues(THCState *state, THCudaLongTensor* keys, +static bool THNN_(checkKeysValues)(THCState *state, THCudaLongTensor* keys, THCTensor* values) { return THCudaLongTensor_size(state, keys, 0) == THCTensor_(nElement)(state, values) @@ -38,7 +38,7 @@ void THNN_(IndexLinear_updateOutput)( "weight matrix must be contiguous"); THArgCheck(THCTensor_(isContiguous)(state, bias), 8, "bias vector must be contiguous"); - THArgCheck(THCUNN_checkKeysValues(state, keys, values), 1, + THArgCheck(THNN_(checkKeysValues)(state, keys, values), 1, "Keys and values should have the same number of elements"); int64_t batchSize = sizes->size[0]; @@ -127,7 +127,7 @@ void THNN_(IndexLinear_accGradParameters)( "bias vector must be contiguous"); THArgCheck(THCTensor_(isContiguous)(state, valuesBuffer), 11, "valuesBuffer vector must be contiguous"); - THArgCheck(THCUNN_checkKeysValues(state, keys, values), 1, + THArgCheck(THNN_(checkKeysValues)(state, keys, values), 1, "Keys and values should have the same number of elements"); THCTensor_(resize2d)(state, gradWeight, keysSize, outDim * (maxNormalize > 0 ? 2 : 1)); @@ -179,7 +179,7 @@ void THNN_(IndexLinear_accUpdateGradParameters)( "weight matrix must be contiguous"); THArgCheck(THCTensor_(isContiguous)(state, bias), 8, "bias vector must be contiguous"); - THArgCheck(THCUNN_checkKeysValues(state, keys, values), 1, + THArgCheck(THNN_(checkKeysValues)(state, keys, values), 1, "Keys and values should have the same number of elements"); int64_t batchSize = sizes->size[0]; diff --git a/aten/src/THCUNN/generic/LookupTable.cu b/aten/src/THCUNN/generic/LookupTable.cu index 38179fa848ae70..b13215209af5fc 100644 --- a/aten/src/THCUNN/generic/LookupTable.cu +++ b/aten/src/THCUNN/generic/LookupTable.cu @@ -184,7 +184,7 @@ void THNN_(LookupTable_renorm)( // At launch time figure out what the index type is and norm type int Norm = ScalarConvert::to(normType); - if (TensorUtils::canUse32BitIndexMath(state, idx)) { + if (THCTensor_canUse32BitIndexMath(state, idx)) { if (Norm == 1) { RUN(1, unsigned int); } else if (Norm == 2) { diff --git a/aten/src/THCUNN/generic/MSECriterion.cu b/aten/src/THCUNN/generic/MSECriterion.cu index bd23f9d512350f..99e29be0502e15 100644 --- a/aten/src/THCUNN/generic/MSECriterion.cu +++ b/aten/src/THCUNN/generic/MSECriterion.cu @@ -2,8 +2,6 @@ #define THC_GENERIC_FILE "generic/MSECriterion.cu" #else -#include "THCApply.cuh" - void THNN_(MSECriterion_updateOutput)( THCState *state, THCTensor *input, diff --git a/aten/src/THCUNN/generic/SmoothL1Criterion.cu b/aten/src/THCUNN/generic/SmoothL1Criterion.cu index 765ddbb71df8e4..c338f173790dd3 100644 --- a/aten/src/THCUNN/generic/SmoothL1Criterion.cu +++ b/aten/src/THCUNN/generic/SmoothL1Criterion.cu @@ -2,8 +2,6 @@ #define THC_GENERIC_FILE "generic/SmoothL1Criterion.cu" #else -#include "THCApply.cuh" - void THNN_(SmoothL1Criterion_updateOutput)( THCState *state, THCTensor *input, diff --git a/aten/src/THCUNN/generic/SoftMarginCriterion.cu b/aten/src/THCUNN/generic/SoftMarginCriterion.cu index 356def09778c0b..a68a9c691879ca 100644 --- a/aten/src/THCUNN/generic/SoftMarginCriterion.cu +++ b/aten/src/THCUNN/generic/SoftMarginCriterion.cu @@ -2,8 +2,6 @@ #define THC_GENERIC_FILE "generic/SoftMarginCriterion.cu" #else -#include "THCApply.cuh" - void THNN_(SoftMarginCriterion_updateOutput)( THCState *state, THCTensor *input, diff --git a/aten/src/THCUNN/generic/SparseLinear.cu b/aten/src/THCUNN/generic/SparseLinear.cu index 3d143697eaf847..4e8844b481ebc9 100644 --- 
a/aten/src/THCUNN/generic/SparseLinear.cu +++ b/aten/src/THCUNN/generic/SparseLinear.cu @@ -2,22 +2,22 @@ #define THC_GENERIC_FILE "generic/SparseLinear.cu" #else -static bool checkInput(THCTensor* t) +static bool THNN_(checkInput)(THCTensor* t) { return t->nDimension == 2 && t->size[1] == 3; } -static bool checkSize2D(THCTensor* t, int64_t size0, int64_t size1) +static bool THNN_(checkSize2D)(THCTensor* t, int64_t size0, int64_t size1) { return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1; } -static bool checkSize1D(THCTensor* t, int64_t size0) +static bool THNN_(checkSize1D)(THCTensor* t, int64_t size0) { return t->nDimension == 1 && t->size[0] == size0; } -static inline void copyCudaFloatingType(THCState *state, THCudaIntTensor *buf, THCTensor *t) { +static inline void THNN_(copyCudaFloatingType)(THCState *state, THCudaIntTensor *buf, THCTensor *t) { #ifdef THC_REAL_IS_FLOAT THCudaIntTensor_copyCudaFloat(state, buf, t); #elif defined(THC_REAL_IS_DOUBLE) @@ -40,9 +40,9 @@ void THNN_(SparseLinear_updateOutput)( int64_t outDim = THCTensor_(size)(state, weight, 0); int64_t inDim = THCTensor_(size)(state, weight, 1); - THArgCheck(checkInput(input), 2, "input size must be nnz x 3"); + THArgCheck(THNN_(checkInput)(input), 2, "input size must be nnz x 3"); THArgCheck(THCTensor_(nDimension)(state, output) == 2, 3, "output must be batchsize x outputsize"); - THArgCheck(checkSize1D(bias, outDim), 5, "bias size wrong"); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); weight = THCTensor_(newContiguous)(state, weight); @@ -66,9 +66,9 @@ void THNN_(SparseLinear_updateOutput)( // If rows might get out of order in future implementations, or if cusparse // complains with an illegal memory access, sort like we do in AccGradParameters THCTensor_(select)(state, sel, input, 1, 0); - copyCudaFloatingType(state, rowbuf, sel); + THNN_(copyCudaFloatingType)(state, rowbuf, sel); THCTensor_(select)(state, sel, input, 1, 1); - copyCudaFloatingType(state, colInds, sel); + THNN_(copyCudaFloatingType)(state, colInds, sel); THCTensor_(select)(state, sel, input, 1, 2); THCTensor_(copyCuda)(state, values, sel); @@ -136,9 +136,9 @@ void THNN_(SparseLinear_accGradParameters)( int64_t outDim = THCTensor_(size)(state, weight, 0); int64_t inDim = THCTensor_(size)(state, weight, 1); - THArgCheck(checkInput(input), 2, "input size must be batchsize x nnz x 2"); - THArgCheck(checkSize2D(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); - THArgCheck(checkSize1D(gradBias, outDim), 5, "gradBias size wrong"); + THArgCheck(THNN_(checkInput)(input), 2, "input size must be batchsize x nnz x 2"); + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); weight = THCTensor_(newContiguous)(state, weight); int64_t nnz = THCTensor_(size)(state, input, 0); @@ -166,9 +166,9 @@ void THNN_(SparseLinear_accGradParameters)( // Get data ready for cusparse, need CudaInt buffers THCTensor_(select)(state, sel, buf, 1, 0); - copyCudaFloatingType(state, rowInds, sel); + THNN_(copyCudaFloatingType)(state, rowInds, sel); THCTensor_(select)(state, sel, buf, 1, 1); - copyCudaFloatingType(state, colbuf, sel); + THNN_(copyCudaFloatingType)(state, colbuf, sel); THCTensor_(select)(state, sel, buf, 1, 2); THCTensor_(copyCuda)(state, values, sel); diff --git a/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu b/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu index 61a1c7046dd25e..117fe9a8109c42 100644 --- 
a/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu +++ b/aten/src/THCUNN/generic/SpatialCrossMapLRN.cu @@ -2,7 +2,7 @@ #define THC_GENERIC_FILE "generic/SpatialCrossMapLRN.cu" #else -void LRNforward(THCState* state, THCTensor* input, THCTensor* output, +void THNN_(LRNforward)(THCState* state, THCTensor* input, THCTensor* output, THCTensor* scale, int local_size, accreal alpha_, accreal beta_, accreal k_) { real alpha = ScalarConvert::to(alpha_); @@ -47,7 +47,7 @@ void LRNforward(THCState* state, THCTensor* input, THCTensor* output, } -void LRNbackward(THCState* state, THCTensor* input, THCTensor* output, +void THNN_(LRNbackward)(THCState* state, THCTensor* input, THCTensor* output, THCTensor* gradOutput, THCTensor* gradInput, THCTensor* scale, int local_size, accreal alpha_, accreal beta_, accreal k_) { @@ -101,7 +101,7 @@ void THNN_(SpatialCrossMapLRN_updateOutput)( accreal beta, accreal k) { - LRNforward(state, input, output, scale, size, alpha, beta, k); + THNN_(LRNforward)(state, input, output, scale, size, alpha, beta, k); } void THNN_(SpatialCrossMapLRN_updateGradInput)( @@ -116,7 +116,7 @@ void THNN_(SpatialCrossMapLRN_updateGradInput)( accreal beta, accreal k) { - LRNbackward(state, input, output, gradOutput, gradInput, scale, size, alpha, beta, k); + THNN_(LRNbackward)(state, input, output, gradOutput, gradInput, scale, size, alpha, beta, k); } #endif diff --git a/aten/src/THCUNN/generic/SpatialReflectionPadding.cu b/aten/src/THCUNN/generic/SpatialReflectionPadding.cu index 734e23a1bc2287..dc9bc99d0db203 100644 --- a/aten/src/THCUNN/generic/SpatialReflectionPadding.cu +++ b/aten/src/THCUNN/generic/SpatialReflectionPadding.cu @@ -7,7 +7,7 @@ void THNN_(SpatialReflectionPadding_updateOutput)(THCState *state, THCTensor *output, int padL, int padR, int padT, int padB) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, input), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, "input tensor must fit into 32-bit index math"); int planeDim = 0; @@ -82,9 +82,9 @@ void THNN_(SpatialReflectionPadding_updateGradInput)( int padL, int padR, int padT, int padB) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, input), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, "input tensor must fit into 32-bit index math"); - THArgCheck(TensorUtils::canUse32BitIndexMath(state, gradOutput), 3, + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), 3, "output gradient tensor must fit into 32-bit index math"); int planeDim = 0; diff --git a/aten/src/THCUNN/generic/SpatialReplicationPadding.cu b/aten/src/THCUNN/generic/SpatialReplicationPadding.cu index 4bc787205d4418..26150adb437d6b 100644 --- a/aten/src/THCUNN/generic/SpatialReplicationPadding.cu +++ b/aten/src/THCUNN/generic/SpatialReplicationPadding.cu @@ -8,7 +8,7 @@ void THNN_(SpatialReplicationPadding_updateOutput)( THCTensor *output, int padL, int padR, int padT, int padB) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, input), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, "input tensor must fit into 32-bit index math"); int planeDim = 0; @@ -72,9 +72,9 @@ void THNN_(SpatialReplicationPadding_updateGradInput)( int padL, int padR, int padT, int padB) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, input), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, "input tensor must fit into 32-bit index math"); - THArgCheck(TensorUtils::canUse32BitIndexMath(state, gradOutput), 3, + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), 3, "output gradient 
tensor must fit into 32-bit index math"); int planeDim = 0; diff --git a/aten/src/THCUNN/generic/TemporalReflectionPadding.cu b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu index 1caf7a356b544e..1dac0219d96dfb 100644 --- a/aten/src/THCUNN/generic/TemporalReflectionPadding.cu +++ b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu @@ -6,7 +6,7 @@ void THNN_(TemporalReflectionPadding_updateOutput)(THCState *state, THCTensor *input, THCTensor *output, int padL, int padR) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, input), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, "input tensor must fit into 32-bit index math"); int planeDim = 0; @@ -71,9 +71,9 @@ void THNN_(TemporalReflectionPadding_updateGradInput)( THCTensor *gradInput, int padL, int padR) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, input), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, "input tensor must fit into 32-bit index math"); - THArgCheck(TensorUtils::canUse32BitIndexMath(state, gradOutput), 3, + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), 3, "output gradient tensor must fit into 32-bit index math"); int planeDim = 0; diff --git a/aten/src/THCUNN/generic/TemporalReplicationPadding.cu b/aten/src/THCUNN/generic/TemporalReplicationPadding.cu index a6c7fd5a81b29a..6e1379b3c3f4af 100644 --- a/aten/src/THCUNN/generic/TemporalReplicationPadding.cu +++ b/aten/src/THCUNN/generic/TemporalReplicationPadding.cu @@ -7,7 +7,7 @@ void THNN_(TemporalReplicationPadding_updateOutput)( THCTensor *input, THCTensor *output, int padL, int padR) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, input), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, "input tensor must fit into 32-bit index math"); int planeDim = 0; @@ -66,9 +66,9 @@ void THNN_(TemporalReplicationPadding_updateGradInput)( THCTensor *gradInput, int padL, int padR) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, input), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, "input tensor must fit into 32-bit index math"); - THArgCheck(TensorUtils::canUse32BitIndexMath(state, gradOutput), 3, + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), 3, "output gradient tensor must fit into 32-bit index math"); int planeDim = 0; diff --git a/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu b/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu index aecbd19078fdb6..cf5a298b3f05db 100644 --- a/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu +++ b/aten/src/THCUNN/generic/VolumetricReplicationPadding.cu @@ -9,7 +9,7 @@ static inline void THNN_(VolumetricReplicationPadding_shapeCheck)( int pleft, int pright, int ptop, int pbottom, int pfront, int pback) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, input), 2, + THArgCheck(THCTensor_canUse32BitIndexMath(state, input), 2, "input tensor must fit into 32-bit index math"); int numInputDims = THCTensor_(nDimension)(state, input); @@ -40,7 +40,7 @@ static inline void THNN_(VolumetricReplicationPadding_shapeCheck)( idepth, iheight, iwidth, odepth, oheight, owidth); if (gradOutput != NULL) { - THArgCheck(TensorUtils::canUse32BitIndexMath(state, gradOutput), + THArgCheck(THCTensor_canUse32BitIndexMath(state, gradOutput), 3, "output gradient tensor must fit into 32-bit index math"); THArgCheck(numPlanes == THCTensor_(size)(state, gradOutput, planeDim), 3, diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc index c88b785b04aea4..7a70698508656e 100644 --- 
a/binaries/benchmark_helper.cc +++ b/binaries/benchmark_helper.cc @@ -179,6 +179,7 @@ void loadInput( void runNetwork( shared_ptr workspace, caffe2::NetDef& net_def, + const bool wipe_cache, const bool run_individual, const int warmup, const int iter) { @@ -196,7 +197,9 @@ void runNetwork( CAFFE_ENFORCE(net->Run(), "Warmup run ", i, " has failed."); } - caffe2::wipe_cache(); + if (wipe_cache) { + caffe2::wipe_cache(); + } LOG(INFO) << "Main runs."; CAFFE_ENFORCE( iter >= 0, @@ -206,11 +209,15 @@ void runNetwork( for (int i = 0; i < iter; ++i) { caffe2::ObserverConfig::initSampleRate(1, 1, 1, 0, warmup); CAFFE_ENFORCE(net->Run(), "Main run ", i, " has failed."); - caffe2::wipe_cache(); + if (wipe_cache) { + caffe2::wipe_cache(); + } if (run_individual) { caffe2::ObserverConfig::initSampleRate(1, 1, 1, 1, warmup); CAFFE_ENFORCE(net->Run(), "Main run ", i, " with operator has failed."); - caffe2::wipe_cache(); + if (wipe_cache) { + caffe2::wipe_cache(); + } } } } diff --git a/binaries/benchmark_helper.h b/binaries/benchmark_helper.h index fa4a2bcebbed5c..4ee32dba219194 100644 --- a/binaries/benchmark_helper.h +++ b/binaries/benchmark_helper.h @@ -87,5 +87,6 @@ void runNetwork( shared_ptr, caffe2::NetDef&, const bool, + const bool, const int, const int); diff --git a/binaries/caffe2_benchmark.cc b/binaries/caffe2_benchmark.cc index b6f8ce6193c836..16c1173d0ff0ff 100644 --- a/binaries/caffe2_benchmark.cc +++ b/binaries/caffe2_benchmark.cc @@ -66,6 +66,10 @@ CAFFE2_DEFINE_bool( false, "Whether to write out output in text format for regression purpose."); CAFFE2_DEFINE_int(warmup, 0, "The number of iterations to warm up."); +CAFFE2_DEFINE_bool( + wipe_cache, + false, + "Whether to evict the cache before running network."); int main(int argc, char** argv) { caffe2::GlobalInit(&argc, &argv); @@ -103,6 +107,7 @@ int main(int argc, char** argv) { runNetwork( workspace, net_def, + caffe2::FLAGS_wipe_cache, caffe2::FLAGS_run_individual, caffe2::FLAGS_warmup, caffe2::FLAGS_iter); diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt new file mode 100644 index 00000000000000..1cb1d31d76d328 --- /dev/null +++ b/c10/CMakeLists.txt @@ -0,0 +1,50 @@ +project(c10 CXX C) + +set(LIB_SOURCES + dummy.cpp +) + +set(TEST_SOURCES + dummy_test.cpp +) + +if(MSVC) + # TODO Also add some warning options that MSVC can understand + set(WARNING_OPTIONS "") +else() + set(WARNING_OPTIONS + -Wall + -Wextra + -Wold-style-cast + -Wno-missing-braces + -Wcast-align + -Wcast-qual + -Wctor-dtor-privacy + -Wdisabled-optimization + -Wformat=2 + -Winit-self + -Wmissing-include-dirs + -Woverloaded-virtual + -Wredundant-decls + -Wshadow + -Wsign-promo + -Wstrict-overflow=5 + -fdiagnostics-show-option + -Wconversion + -Wpedantic + -Wno-gnu-zero-variadic-macro-arguments + -Wundef + -Werror + ) +endif() + +add_library(${PROJECT_NAME} OBJECT ${LIB_SOURCES}) +target_compile_options(${PROJECT_NAME} PRIVATE ${WARNING_OPTIONS}) + +if(BUILD_TEST) + add_executable(${PROJECT_NAME}_test ${TEST_SOURCES} $) + add_test(NAME ${PROJECT_NAME}_test COMMAND $) + target_compile_options(${PROJECT_NAME}_test PRIVATE ${WARNING_OPTIONS}) + target_link_libraries(${PROJECT_NAME}_test gtest_main) + install(TARGETS ${PROJECT_NAME}_test DESTINATION test) +endif() diff --git a/c10/dummy.cpp b/c10/dummy.cpp new file mode 100644 index 00000000000000..7eb029f2618e7f --- /dev/null +++ b/c10/dummy.cpp @@ -0,0 +1,3 @@ +// This is going to be replaced with actual c10 files + +#include diff --git a/c10/dummy.h b/c10/dummy.h new file mode 100644 index 
00000000000000..c4be4d6b808728 --- /dev/null +++ b/c10/dummy.h @@ -0,0 +1,3 @@ +#pragma once + +// This is going to be replaced with actual c10 files diff --git a/c10/dummy_test.cpp b/c10/dummy_test.cpp new file mode 100644 index 00000000000000..4e32bef2cd3aa5 --- /dev/null +++ b/c10/dummy_test.cpp @@ -0,0 +1,6 @@ +#include +#include + +TEST(DummyTest, dummy) { + EXPECT_TRUE(true); +} diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 3a4db6f1819479..a31c7521196c17 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -8,8 +8,6 @@ if(BUILD_ATEN) set(__caffe2_CMAKE_POSITION_INDEPENDENT_CODE ${CMAKE_POSITION_INDEPENDENT_CODE}) set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(AT_LINK_STYLE INTERFACE) - # Disable contrib for root-level build - set(ATEN_NO_CONTRIB ON) add_subdirectory(../aten aten) set(CMAKE_POSITION_INDEPENDENT_CODE ${__caffe2_CMAKE_POSITION_INDEPENDENT_CODE}) @@ -44,6 +42,11 @@ if(BUILD_ATEN) list(APPEND Caffe2_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS}) list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS}) list(APPEND Caffe2_DEPENDENCY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE}) + IF(USE_ROCM) + # Set the HIP Variables + set(Caffe2_HIP_SRCS ${ATen_CUDA_SRCS}) + set(Caffe2_HIP_INCLUDES ${Caffe2_HIP_INCLUDES} ${Caffe2_GPU_INCLUDE}) + ENDIF() endif() # ---[ Caffe2 build @@ -177,6 +180,7 @@ if(BUILD_CAFFE2) endif() # Compile exposed libraries. +list(APPEND Caffe2_CPU_SRCs $) add_library(caffe2 ${Caffe2_CPU_SRCS}) if (BUILD_CAFFE2) caffe2_interface_library(caffe2_protos caffe2_protos_whole) @@ -199,13 +203,12 @@ target_compile_options(caffe2 INTERFACE "-std=c++11") target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB") # Use -O2 for release builds (-O3 doesn't improve perf, and -Os results in perf regression) target_compile_options(caffe2 PRIVATE "$<$,$>:-O2>") -set_target_properties(caffe2 PROPERTIES VERSION 1 SOVERSION 1) install(TARGETS caffe2 EXPORT Caffe2Targets DESTINATION lib) caffe2_interface_library(caffe2 caffe2_library) list(APPEND Caffe2_MAIN_LIBS caffe2_library) # ---[ CUDA library. -if(USE_CUDA OR (USE_ROCM AND NOT BUILD_CAFFE2)) +if(USE_CUDA) # A hack to deal with cuda library dependencies and modern CMake: the # CUDA_ADD_LIBRARY includes a target_link_libraries, and as a result, # one cannot use PUBLIC/PRIVATE/INTERFACE for the target anymore. This @@ -236,7 +239,6 @@ if(USE_CUDA OR (USE_ROCM AND NOT BUILD_CAFFE2)) # Set standard properties on the target aten_set_target_props(caffe2_gpu) - set_target_properties(caffe2_gpu PROPERTIES VERSION 1 SOVERSION 1) install(TARGETS caffe2_gpu EXPORT Caffe2Targets DESTINATION lib) caffe2_interface_library(caffe2_gpu caffe2_gpu_library) @@ -244,25 +246,40 @@ if(USE_CUDA OR (USE_ROCM AND NOT BUILD_CAFFE2)) endif() # ---[ Caffe2 HIP sources. 
-if(BUILD_CAFFE2) - if(USE_ROCM) - HIP_ADD_LIBRARY(caffe2_hip ${Caffe2_HIP_SRCS}) - set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${Caffe2_HIP_CXX_FLAGS}) +if(USE_ROCM) + HIP_INCLUDE_DIRECTORIES(${Caffe2_HIP_INCLUDES}) - target_include_directories( - caffe2_hip PUBLIC ${Caffe2_HIP_INCLUDES}) - target_include_directories( - caffe2_hip INTERFACE $) + IF(BUILD_ATEN) + # Set necessary HIPCC Flags + SET(HIP_HCC_FLAGS "-fPIC -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${HIP_HCC_FLAGS}") + SET(Caffe2_HIP_CXX_FLAGS "-fPIC -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -D__HIP_PLATFORM_HCC__=1 ${Caffe2_HIP_CXX_FLAGS}") + ENDIF() - target_link_libraries(caffe2_hip PUBLIC caffe2) - target_link_libraries(caffe2_hip PUBLIC ${Caffe2_HIP_DEPENDENCY_LIBS}) + # Since the HIP_ADD_LIBRARY is a MACRO, we need to set HIP_HCC_FLAGS prior to calling it. + # Also, Since HIP_INCLUDE_DIRECTORIES is a MACRO, we must call it before HIP_ADD_LIBRARY. + HIP_ADD_LIBRARY(caffe2_hip ${Caffe2_HIP_SRCS}) - set_target_properties(caffe2_hip PROPERTIES LINKER_LANGUAGE HIP) + set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${Caffe2_HIP_CXX_FLAGS}) - caffe2_interface_library(caffe2_hip caffe2_hip_library) - list(APPEND Caffe2_MAIN_LIBS caffe2_hip_library) - install(TARGETS caffe2_hip EXPORT Caffe2Targets DESTINATION lib) - endif() + target_include_directories( + caffe2_hip PRIVATE ${Caffe2_HIP_INCLUDES}) + target_include_directories( + caffe2_hip INTERFACE $) + + IF(BUILD_ATEN) + aten_set_target_props(caffe2_hip) + ENDIF() + + target_link_libraries(caffe2_hip PUBLIC caffe2) + target_link_libraries(caffe2_hip PUBLIC ${Caffe2_HIP_DEPENDENCY_LIBS}) + + # https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_faq.md#what-if-hip-generates-error-of-symbol-multiply-defined-only-on-amd-machine + # To avoid having to do above, keep this commented. + # set_target_properties(caffe2_hip PROPERTIES LINKER_LANGUAGE HIP) + + caffe2_interface_library(caffe2_hip caffe2_hip_library) + list(APPEND Caffe2_MAIN_LIBS caffe2_hip_library) + install(TARGETS caffe2_hip EXPORT Caffe2Targets DESTINATION lib) endif() # ---[ Check if warnings should be errors. @@ -292,7 +309,9 @@ if(BUILD_CAFFE2) target_compile_features(${test_name} PRIVATE cxx_range_for) endif() add_test(NAME ${test_name} COMMAND $) - install(TARGETS ${test_name} DESTINATION test) + if (INSTALL_TEST) + install(TARGETS ${test_name} DESTINATION test) + endif() endforeach() if(USE_ROCM) @@ -314,7 +333,8 @@ if(BUILD_CAFFE2) # Aten tests should only run when Caffe2 is not built set(__aten_test_dir "test/aten") endif() -if(BUILD_ATEN) +# Todo - Set up ATen tests for ROCm in an upcoming PR +if(BUILD_ATEN AND NOT USE_ROCM) foreach(test_src ${ATen_CPU_TEST_SRCS}) get_filename_component(test_name ${test_src} NAME_WE) add_executable(${test_name} "${test_src}") diff --git a/caffe2/core/common_gpu.h b/caffe2/core/common_gpu.h index 6ee551c219480c..f56ffe893db54b 100644 --- a/caffe2/core/common_gpu.h +++ b/caffe2/core/common_gpu.h @@ -298,86 +298,126 @@ struct SimpleArray { constexpr int kCUDATensorMaxDims = 8; -#define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(val, func, T, ...) \ +#define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(val, Func, T, ...) 
\ do { \ CAFFE_ENFORCE_LE(val, kCUDATensorMaxDims); \ switch (val) { \ case 1: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 2: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 3: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 4: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 5: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 6: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 7: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 8: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ default: { break; } \ } \ } while (false) -#define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(val, func, T1, T2, ...) \ +#define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(val, Func, T1, T2, ...) \ do { \ CAFFE_ENFORCE_LE(val, kCUDATensorMaxDims); \ switch (val) { \ case 1: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 2: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 3: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 4: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 5: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 6: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 7: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ case 8: { \ - func(__VA_ARGS__); \ + Func(__VA_ARGS__); \ break; \ } \ default: { break; } \ } \ } while (false) +#define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_3(val, Func, T1, T2, T3, ...) \ + do { \ + CAFFE_ENFORCE_LE(val, kCUDATensorMaxDims); \ + switch (val) { \ + case 1: { \ + Func(__VA_ARGS__); \ + break; \ + } \ + case 2: { \ + Func(__VA_ARGS__); \ + break; \ + } \ + case 3: { \ + Func(__VA_ARGS__); \ + break; \ + } \ + case 4: { \ + Func(__VA_ARGS__); \ + break; \ + } \ + case 5: { \ + Func(__VA_ARGS__); \ + break; \ + } \ + case 6: { \ + Func(__VA_ARGS__); \ + break; \ + } \ + case 7: { \ + Func(__VA_ARGS__); \ + break; \ + } \ + case 8: { \ + Func(__VA_ARGS__); \ + break; \ + } \ + default: { break; } \ + } \ + } while (false) + } // namespace caffe2 #endif // CAFFE2_CORE_COMMON_GPU_H_ diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index 9a901d2ffe1e53..bbff0f4bec4185 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -51,7 +51,7 @@ class ThreadLocalCUDAObjects { cudaStream_t GetStream(int gpu, int stream_id) { vector& gpu_streams = cuda_streams_[gpu]; - if (gpu_streams.size() <= stream_id) { + if (gpu_streams.size() <= (unsigned)stream_id) { gpu_streams.resize(stream_id + 1, nullptr); } if (!gpu_streams[stream_id]) { @@ -65,7 +65,7 @@ class ThreadLocalCUDAObjects { cublasHandle_t GetHandle(int gpu, int stream_id) { DeviceGuard guard(gpu); vector& gpu_handles = cublas_handles_[gpu]; - if (gpu_handles.size() <= stream_id) { + if (gpu_handles.size() <= (unsigned)stream_id) { gpu_handles.resize(stream_id + 1, nullptr); } if (!gpu_handles[stream_id]) { @@ -84,7 +84,7 @@ class ThreadLocalCUDAObjects { cudnnHandle_t GetCudnnHandle(int gpu, int stream_id) { DeviceGuard guard(gpu); vector& gpu_handles = cudnn_handles_[gpu]; - if (gpu_handles.size() <= stream_id) { + if (gpu_handles.size() <= (unsigned)stream_id) { gpu_handles.resize(stream_id + 1, nullptr); } if (!gpu_handles[stream_id]) { diff --git a/caffe2/core/db.cc b/caffe2/core/db.cc index b88afce259d058..3dd993c925a2d4 100644 --- a/caffe2/core/db.cc +++ b/caffe2/core/db.cc @@ -52,10 +52,10 @@ class MiniDBCursor : public 
Cursor { CAFFE_ENFORCE_GT(key_len_, 0); CAFFE_ENFORCE_GT(value_len_, 0); // Resize if the key and value len is larger than the current one. - if (key_len_ > key_.size()) { + if (key_len_ > (int)key_.size()) { key_.resize(key_len_); } - if (value_len_ > value_.size()) { + if (value_len_ > (int)value_.size()) { value_.resize(value_len_); } // Actually read in the contents. diff --git a/caffe2/core/db.h b/caffe2/core/db.h index 7bcfa1a5f6306b..7c5b79df691918 100644 --- a/caffe2/core/db.h +++ b/caffe2/core/db.h @@ -226,7 +226,7 @@ class DBReader { *value = cursor_->value(); // In sharded mode, each read skips num_shards_ records - for (int s = 0; s < num_shards_; s++) { + for (uint32_t s = 0; s < num_shards_; s++) { cursor_->Next(); if (!cursor_->Valid()) { MoveToBeginning(); @@ -270,7 +270,7 @@ class DBReader { void MoveToBeginning() const { cursor_->SeekToFirst(); - for (auto s = 0; s < shard_id_; s++) { + for (uint32_t s = 0; s < shard_id_; s++) { cursor_->Next(); CAFFE_ENFORCE( cursor_->Valid(), "Db has less rows than shard id: ", s, shard_id_); diff --git a/caffe2/core/flags.cc b/caffe2/core/flags.cc index 2df6edc7848edd..04edccfdef576b 100644 --- a/caffe2/core/flags.cc +++ b/caffe2/core/flags.cc @@ -85,7 +85,7 @@ bool ParseCaffeCommandLineFlags(int* pargc, char*** pargv) { string key; string value; - int prefix_idx = arg.find('='); + size_t prefix_idx = arg.find('='); if (prefix_idx == string::npos) { // If there is no equality char in the arg, it means that the // arg is specified in the next argument. diff --git a/caffe2/core/graph.cc b/caffe2/core/graph.cc index d7a900f802b4d0..ba5a3a136404c6 100644 --- a/caffe2/core/graph.cc +++ b/caffe2/core/graph.cc @@ -22,7 +22,7 @@ Graph::Graph(const NetDef& net) : netdef_(net) { // In python, this is known as "versions". std::unordered_map edge_parent; - for (int i = 0; i < nodes_.size(); i++) { + for (int i = 0; i < (int)nodes_.size(); i++) { for (const string& blob : node(i).op.input()) { auto it = edge_parent.find(blob); if (it != edge_parent.end()) { @@ -43,7 +43,7 @@ Graph::Graph(const NetDef& net) : netdef_(net) { // For any blob, which operator was the last to read to from it? std::unordered_map edge_child; - for (int i = nodes_.size() - 1; i >= 0; i--) { + for (int i = (int)nodes_.size() - 1; i >= 0; i--) { for (const string& blob : node(i).op.output()) { auto it = edge_child.find(blob); if (it == edge_child.end()) { @@ -76,7 +76,7 @@ const std::vector> Graph::GetSubgraphPerimeterHelper( const std::vector& match) { std::vector> edge_list; std::unordered_set match_set(match.begin(), match.end()); - for (int x = 0; x < nodes_.size(); x++) { + for (int x = 0; x < (int)nodes_.size(); x++) { if (!is_node_active(x)) { continue; } @@ -142,7 +142,7 @@ NetDef Graph::GetNetDef() { // However, giving each blob unique names via SSA will satisfy this condition. // Then, the resulting graph can be optimized with memonger. 
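// Standalone illustrative sketch of the sharded-read pattern in DBReader
// above: reader shard_id of num_shards starts on record shard_id and advances
// num_shards records per Read(), wrapping around at the end of the database.
// Loop counters are unsigned, matching the signedness fixes in this diff.
// ShardedReader is a hypothetical stand-in, not the Caffe2 class.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

class ShardedReader {
 public:
  ShardedReader(std::vector<int> records, uint32_t num_shards, uint32_t shard_id)
      : records_(std::move(records)), num_shards_(num_shards), shard_id_(shard_id) {
    MoveToBeginning();
  }

  int Read() {
    const int value = records_[pos_];
    // Skip num_shards_ records so consecutive reads stay on this shard.
    for (uint32_t s = 0; s < num_shards_; ++s) {
      ++pos_;
      if (pos_ >= records_.size()) {
        MoveToBeginning();
        break;
      }
    }
    return value;
  }

 private:
  void MoveToBeginning() {
    // Skip shard_id_ records so shard k starts on record k.
    pos_ = shard_id_;
  }

  std::vector<int> records_;
  uint32_t num_shards_;
  uint32_t shard_id_;
  std::size_t pos_ = 0;
};

int main() {
  ShardedReader reader({10, 11, 12, 13, 14, 15}, /*num_shards=*/2, /*shard_id=*/1);
  for (int i = 0; i < 4; ++i) {
    std::printf("%d ", reader.Read());  // 11 13 15 11
  }
  std::printf("\n");
}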
- for (int i = 0; i < nodes_.size(); i++) { + for (int i = 0; i < (int)nodes_.size(); i++) { unchecked_parent_count.push_back(node(i).parents.size()); if (node(i).parents.size() == 0 && is_node_active(i)) { q.push(i); diff --git a/caffe2/core/memonger.cc b/caffe2/core/memonger.cc index 4a61ecb1060a85..121feaf30e736e 100644 --- a/caffe2/core/memonger.cc +++ b/caffe2/core/memonger.cc @@ -30,7 +30,7 @@ NetDef optimize_inference_net( // Step 1: count first and last operator for each blob std::unordered_set all_blobs; std::unordered_map> ranges; - for (int i = 0; i < ops.size(); i++) { + for (size_t i = 0; i < ops.size(); i++) { for (auto& inp : ops[i].input()) { if (ranges.find(inp) != ranges.end()) { ranges[inp].second = i; @@ -53,7 +53,7 @@ NetDef optimize_inference_net( std::unordered_map renaming; std::unordered_map mapping; - for (int i = 0; i < ops.size(); i++) { + for (int i = 0; i < (int)ops.size(); i++) { auto& op = ops[i]; std::unordered_set new_free_blobs; @@ -366,7 +366,7 @@ class ComputeBlobRecyclingForDag { for (const auto& input : current_op.input()) { if (has_key(shareable_blob_names, input)) { blob_input_count_[input]++; - if (blob_input_count_[input] == blob_to_ops_[input].size()) { + if (blob_input_count_[input] == (int)blob_to_ops_[input].size()) { const string& actual_blob = get_blob_or_mapped_blob(input); if (!has_key(dont_share_blob_names, actual_blob)) { new_free_blobs.emplace_back( @@ -456,7 +456,7 @@ class ComputeBlobRecyclingForDag { return 0; } int size = 1; - for (int i = 0; i < blob_shapes_iter->second.size(); ++i) { + for (size_t i = 0; i < blob_shapes_iter->second.size(); ++i) { size *= blob_shapes_iter->second[i]; } return size; @@ -526,7 +526,7 @@ class ComputeBlobRecyclingForDag { const int blob_size = infer_blob_size(blob_name, blob_shapes); int best_size = -1; int free_blob_index = -1; - for (int i = 0; i < free_blobs->size(); ++i) { + for (size_t i = 0; i < free_blobs->size(); ++i) { const string& cb_name = (*free_blobs)[i].second; if (can_use_blob(cb_name, tokens, device)) { const int cand_bz = blob_sizes_[cb_name]; diff --git a/caffe2/core/net.cc b/caffe2/core/net.cc index b14e55ea5a5b6e..9d6f536c063467 100644 --- a/caffe2/core/net.cc +++ b/caffe2/core/net.cc @@ -109,7 +109,7 @@ void checkExecutorOverride(std::string& net_type) { CAFFE_ENFORCE( executors.size() % 2 == 0, "Invalid override executors flag value"); std::unordered_map overrides; - for (auto idx = 0; idx < executors.size() - 1; idx += 2) { + for (size_t idx = 0; idx < executors.size() - 1; idx += 2) { overrides[executors[idx]] = executors[idx + 1]; } if (overrides.count(net_type)) { diff --git a/caffe2/core/net_async_base.cc b/caffe2/core/net_async_base.cc index 30c2371b58b783..742a3457ec9164 100644 --- a/caffe2/core/net_async_base.cc +++ b/caffe2/core/net_async_base.cc @@ -163,7 +163,7 @@ int AsyncNetBase::stream(int task_id) { if (device_option.device_type() == CUDA) { int gpu_id = device_option.cuda_gpu_id(); CAFFE_ENFORCE_GE(gpu_id, 0, "Invalid gpu id: " + caffe2::to_string(gpu_id)); - if (gpu_id >= stream_counters_.size()) { + if ((unsigned)gpu_id >= stream_counters_.size()) { stream_counters_.resize(gpu_id + 1, 0); } do { diff --git a/caffe2/core/net_async_dag_gpu.cc b/caffe2/core/net_async_dag_gpu.cc index 402b27c47141f1..12bd33ac7e247d 100644 --- a/caffe2/core/net_async_dag_gpu.cc +++ b/caffe2/core/net_async_dag_gpu.cc @@ -114,7 +114,7 @@ int AsyncDAGNet::stream(const DeviceOption& device_option) { if (device_option.device_type() == CUDA) { int gpu_id = device_option.cuda_gpu_id(); 
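// Standalone illustrative sketch of the executor-override lookup in
// checkExecutorOverride above: a flat "net_type,executor,net_type,executor"
// list is folded into a map and consulted by net type. The unsigned loop
// counter mirrors the fix in this diff; idx + 1 < size also avoids unsigned
// wrap-around on an empty list. Names are hypothetical, not the Caffe2 ones.
#include <cstddef>
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

std::string ResolveExecutor(const std::vector<std::string>& flat_overrides,
                            const std::string& net_type) {
  if (flat_overrides.size() % 2 != 0) {
    return net_type;  // malformed override list; keep the requested type
  }
  std::unordered_map<std::string, std::string> overrides;
  for (std::size_t idx = 0; idx + 1 < flat_overrides.size(); idx += 2) {
    overrides[flat_overrides[idx]] = flat_overrides[idx + 1];
  }
  const auto it = overrides.find(net_type);
  return it == overrides.end() ? net_type : it->second;
}

int main() {
  const std::vector<std::string> flat = {"dag", "async_scheduling", "simple", "simple"};
  std::printf("%s\n", ResolveExecutor(flat, "dag").c_str());  // async_scheduling
}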
CAFFE_ENFORCE_GE(gpu_id, 0, "Invalid gpu id: " + caffe2::to_string(gpu_id)); - if (gpu_id >= stream_counters_.size()) { + if ((unsigned)gpu_id >= stream_counters_.size()) { stream_counters_.resize(gpu_id + 1, 0); } do { diff --git a/caffe2/core/net_async_tracing.cc b/caffe2/core/net_async_tracing.cc index 4ef17f68742e0c..0d3f83aba277ed 100644 --- a/caffe2/core/net_async_tracing.cc +++ b/caffe2/core/net_async_tracing.cc @@ -255,7 +255,7 @@ Tracer::~Tracer() { renameThreads(); std::stringstream serialized; serialized << "[\n"; - for (auto idx = 0; idx < events_.size(); ++idx) { + for (size_t idx = 0; idx < events_.size(); ++idx) { serialized << serializeEvent(events_[idx]); if (idx != events_.size() - 1) { serialized << ",\n"; diff --git a/caffe2/core/net_dag.cc b/caffe2/core/net_dag.cc index fca21866f10420..8ecc17f902c958 100644 --- a/caffe2/core/net_dag.cc +++ b/caffe2/core/net_dag.cc @@ -51,7 +51,7 @@ DAGNetBase::DAGNetBase( // Figure out the initial frontier - this is the one we will feed into the job // queue to start a run. - for (int idx = 0; idx < operator_nodes_.size(); ++idx) { + for (size_t idx = 0; idx < operator_nodes_.size(); ++idx) { if (operator_nodes_[idx].parents_.size() == 0) { initial_frontier_.push_back(idx); } @@ -66,7 +66,7 @@ DAGNetBase::DAGNetBase( } num_workers_ = num_workers; - for (int idx = 0; idx < operator_nodes_.size(); ++idx) { + for (size_t idx = 0; idx < operator_nodes_.size(); ++idx) { if (operator_nodes_[idx].is_chain_start_) { task_timers_[idx] = caffe2::make_unique(); } @@ -112,11 +112,11 @@ bool DAGNetBase::DoRunAsync() { job_queue_ = caffe2::make_unique>(); } // Figure out number of workers to start. - auto num_workers_to_start = num_workers_ - workers_.size(); + size_t num_workers_to_start = num_workers_ - workers_.size(); // Ensure the number of workers matches the defined in case // any of the previously started threads terminated. 
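// Standalone illustrative sketch of the scheduling idea behind
// DAGNetBase::initial_frontier_ above: operators with no parents are runnable
// immediately, and finishing an operator decrements its children's pending
// parent counts, pushing newly ready ones. Plain Kahn-style ordering with
// hypothetical names; the real net runs operator chains on worker threads.
#include <cstddef>
#include <cstdio>
#include <queue>
#include <vector>

struct Node {
  std::vector<int> parents;
  std::vector<int> children;
};

std::vector<int> TopologicalOrder(const std::vector<Node>& nodes) {
  std::vector<std::size_t> pending(nodes.size());
  std::queue<int> frontier;
  for (std::size_t idx = 0; idx < nodes.size(); ++idx) {  // unsigned, as in the fixes above
    pending[idx] = nodes[idx].parents.size();
    if (pending[idx] == 0) {
      frontier.push(static_cast<int>(idx));  // the "initial frontier"
    }
  }
  std::vector<int> order;
  while (!frontier.empty()) {
    const int i = frontier.front();
    frontier.pop();
    order.push_back(i);
    for (const int child : nodes[i].children) {
      if (--pending[child] == 0) {
        frontier.push(child);
      }
    }
  }
  return order;
}

int main() {
  // 0 -> {1, 2} -> 3
  std::vector<Node> nodes(4);
  nodes[0] = {{}, {1, 2}};
  nodes[1] = {{0}, {3}};
  nodes[2] = {{0}, {3}};
  nodes[3] = {{1, 2}, {}};
  for (const int i : TopologicalOrder(nodes)) {
    std::printf("%d ", i);  // 0 1 2 3
  }
  std::printf("\n");
}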
- for (auto i = 0; i < num_workers_to_start; i++) { + for (size_t i = 0; i < num_workers_to_start; i++) { VLOG(1) << "Start worker #" << workers_.size(); workers_.push_back(std::thread(&DAGNetBase::WorkerFunction, this)); } diff --git a/caffe2/core/net_dag_utils.cc b/caffe2/core/net_dag_utils.cc index 9cec13906dc095..6c24a3fdba06f4 100644 --- a/caffe2/core/net_dag_utils.cc +++ b/caffe2/core/net_dag_utils.cc @@ -30,7 +30,7 @@ void prune(int node_idx, std::vector& nodes) { // If the node has already been visited, pop curr out of // stack and clean up the ancestor table - CAFFE_ENFORCE(curr < ancestors.size(), "Out of bound access"); + CAFFE_ENFORCE(curr < (int)ancestors.size(), "Out of bound access"); if (ancestors[curr]) { ancestors[curr] = false; nodes_stack.pop(); @@ -93,7 +93,7 @@ std::vector pruneOpNodeGraph( pruned.push_back(nd); } - for (int i = 0; i < pruned.size(); ++i) { + for (int i = 0; i < (int)pruned.size(); ++i) { if (pruned[i].parents_.size() == 0) { prune(i, pruned); } @@ -107,7 +107,7 @@ std::vector pruneOpNodeGraph( void updateOperatorNodes( std::vector& nodes, const ExecutionChains& chains) { - for (int i = 0; i < nodes.size(); ++i) { + for (int i = 0; i < (int)nodes.size(); ++i) { auto& node = nodes[i]; if (chains.find(i) != chains.end()) { node.is_chain_start_ = true; @@ -123,7 +123,7 @@ void updateOperatorNodes( ExecutionChains computeChains(std::vector& orig_nodes) { const std::vector nodes = pruneOpNodeGraph(orig_nodes); vector initial_frontier; - for (int idx = 0; idx < nodes.size(); ++idx) { + for (int idx = 0; idx < (int)nodes.size(); ++idx) { if (nodes[idx].parents_.size() == 0) { initial_frontier.push_back(idx); } @@ -280,7 +280,7 @@ ExecutionChains computeChains(std::vector& orig_nodes) { ExecutionChains singleChains(std::vector& nodes) { ExecutionChains chains; - for (auto i = 0; i < nodes.size(); ++i) { + for (int i = 0; i < (int)nodes.size(); ++i) { chains[i] = {i}; } updateOperatorNodes(nodes, chains); @@ -362,7 +362,7 @@ std::vector prepareOperatorNodes( // Now, make sure that the parent list and the children list do not contain // duplicated items. - for (int i = 0; i < operator_nodes.size(); ++i) { + for (int i = 0; i < (int)operator_nodes.size(); ++i) { auto& node = operator_nodes[i]; // Sort, remove duplicates, and delete self dependency. 
auto& p = node.parents_; @@ -383,7 +383,7 @@ std::vector prepareChainGraphNodes( const std::vector& operator_nodes, const std::vector>& execution_chains) { std::unordered_map op_to_chain_idx; - for (int chain_idx = 0; chain_idx < execution_chains.size(); ++chain_idx) { + for (int chain_idx = 0; chain_idx < (int)execution_chains.size(); ++chain_idx) { const auto& chain_indices = execution_chains[chain_idx]; for (const auto& chain_op_idx : chain_indices) { CAFFE_ENFORCE(!op_to_chain_idx.count(chain_op_idx)); @@ -392,7 +392,7 @@ std::vector prepareChainGraphNodes( } std::vector chain_nodes(execution_chains.size()); - for (int op_idx = 0; op_idx < operator_nodes.size(); ++op_idx) { + for (int op_idx = 0; op_idx < (int)operator_nodes.size(); ++op_idx) { CAFFE_ENFORCE(op_to_chain_idx.count(op_idx)); auto chain_idx = op_to_chain_idx[op_idx]; auto& chain = chain_nodes[chain_idx]; diff --git a/caffe2/core/net_simple.cc b/caffe2/core/net_simple.cc index 43d6aec84c5dbd..46c1e8f5956d51 100644 --- a/caffe2/core/net_simple.cc +++ b/caffe2/core/net_simple.cc @@ -178,7 +178,7 @@ vector SimpleNet::TEST_Benchmark( ++idx; } } - int idx = 0; + size_t idx = 0; for (auto& op : operators_) { const string& op_type = op->debug_def().type(); const string& print_name = @@ -232,7 +232,7 @@ vector SimpleNet::TEST_Benchmark( metric_per_op_type_vec_vec.emplace_back(&memory_bytes_read_per_op_type); metric_per_op_type_vec_vec.emplace_back(&memory_bytes_written_per_op_type); metric_per_op_type_vec_vec.emplace_back(¶m_bytes_per_op_type); - for (int i = 0; i < metric_per_op_type_vec_vec.size(); ++i) { + for (size_t i = 0; i < metric_per_op_type_vec_vec.size(); ++i) { std::cout << metric[i] << " per operator type:" << std::endl; auto* item = metric_per_op_type_vec_vec[i]; std::vector> metric_per_op_type_vec( @@ -260,7 +260,7 @@ vector SimpleNet::TEST_Benchmark( } } // We will reuse time_per_op to return the result of BenchmarkNet. 
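// Standalone illustrative sketch of the "sort, remove duplicates, and delete
// self dependency" step in prepareOperatorNodes above, applied to a plain
// vector of parent indices. NormalizeParents is a hypothetical helper.
#include <algorithm>
#include <cstdio>
#include <vector>

void NormalizeParents(int self, std::vector<int>* parents) {
  std::sort(parents->begin(), parents->end());
  parents->erase(std::unique(parents->begin(), parents->end()), parents->end());
  parents->erase(std::remove(parents->begin(), parents->end(), self),
                 parents->end());
}

int main() {
  std::vector<int> parents = {3, 1, 3, 2, 2};
  NormalizeParents(/*self=*/2, &parents);
  for (const int p : parents) {
    std::printf("%d ", p);  // 1 3
  }
  std::printf("\n");
}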
- for (int i = 0; i < time_per_op.size(); ++i) { + for (size_t i = 0; i < time_per_op.size(); ++i) { time_per_op[i] /= main_runs; } if (FLAGS_caffe2_simple_net_benchmark_run_whole_net) { diff --git a/caffe2/core/numa.cc b/caffe2/core/numa.cc index 26c1284b4c4d0e..a8c2d4f22f541d 100644 --- a/caffe2/core/numa.cc +++ b/caffe2/core/numa.cc @@ -75,7 +75,7 @@ void NUMAMove(void* ptr, size_t size, int numa_node_id) { size_t page_start_ptr = (((size_t)ptr) & ~(getpagesize() - 1)); size_t offset = ((size_t)ptr) - page_start_ptr; // Avoid extra dynamic allocation and NUMA api calls - CAFFE_ENFORCE(numa_node_id >= 0 && numa_node_id < sizeof(unsigned long) * 8); + CAFFE_ENFORCE(numa_node_id >= 0 && (unsigned)numa_node_id < sizeof(unsigned long) * 8); unsigned long mask = 1UL << numa_node_id; CAFFE_ENFORCE( mbind( diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc index 1f54254fadadea..9a71993e826fa6 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -148,7 +148,7 @@ unique_ptr _CreateOperator( << engine; auto op = TryCreateOperator(key, operator_def, ws); if (op) { - if (engine.size() <= FLAGS_caffe2_operator_max_engine_name_length) { + if (engine.size() <= (unsigned)FLAGS_caffe2_operator_max_engine_name_length) { op->annotate_engine(engine); } else { op->annotate_engine( @@ -450,7 +450,7 @@ TensorShapes InferBlobShapesAndTypes( CaffeMap grads_to_params = GradientMakerBase::MatchGradsToParams(op); - for (int i = 0; i < out.size(); i++) { + for (size_t i = 0; i < out.size(); i++) { if (out[i].unknown_shape()) { std::string gradout = op.output(i); @@ -479,7 +479,7 @@ TensorShapes InferBlobShapesAndTypes( return tps; } - if (out.size() != op.output_size()) { + if (out.size() != (unsigned)op.output_size()) { if (op.type() == "Slice") { CAFFE_ENFORCE( out.size() == 0, @@ -495,7 +495,7 @@ TensorShapes InferBlobShapesAndTypes( out.size()); } } else { - for (int i = 0; i < out.size(); i++) { + for (size_t i = 0; i < out.size(); i++) { blob_desc[op.output(i)] = out[i]; } } diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index f31ac181b7aae7..9372ed49f4243a 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -189,7 +189,7 @@ class OperatorBase : public Observable { bool found_input; if (err->caller() != nullptr) { - for (int i = 0; i < inputs_.size(); i++) { + for (size_t i = 0; i < inputs_.size(); i++) { if (inputs_[i]->GetRaw() == err->caller()) { found_input = true; err->AppendMessage( @@ -197,7 +197,7 @@ class OperatorBase : public Observable { break; } } - for (int i = 0; i < outputs_.size(); i++) { + for (size_t i = 0; i < outputs_.size(); i++) { if (outputs_[i]->GetRaw() == err->caller()) { if (found_input) { err->AppendMessage("\n OR "); diff --git a/caffe2/core/operator_schema.cc b/caffe2/core/operator_schema.cc index c38c3f724ad822..678646a520a261 100644 --- a/caffe2/core/operator_schema.cc +++ b/caffe2/core/operator_schema.cc @@ -286,7 +286,7 @@ DEFINE_STANDARG_ARG(IsTest, is_test) #undef DEFINE_STANDARG_ARG OpSchema& OpSchema::Input(const int n, const char* name, const char* description) { - if (input_desc_.size() <= n) { + if (input_desc_.size() <= (unsigned)n) { input_desc_.resize(n + 1); } input_desc_[n] = std::make_pair(name, description); @@ -294,7 +294,7 @@ OpSchema& OpSchema::Input(const int n, const char* name, const char* description } OpSchema& OpSchema::Output(const int n, const char* name, const char* description) { - if (output_desc_.size() <= n) { + if (output_desc_.size() <= (unsigned)n) { output_desc_.resize(n + 1); } 
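// Standalone illustrative sketch of the single-word node mask built in
// NUMAMove above: the node id must index a bit inside one unsigned long,
// which is exactly the bound the strengthened CAFFE_ENFORCE checks
// (sizeof(unsigned long) * 8 bits). Plain asserts stand in for CAFFE_ENFORCE,
// and the actual mbind() call is omitted.
#include <cassert>
#include <cstdio>

unsigned long NodeMask(int numa_node_id) {
  assert(numa_node_id >= 0);
  assert(static_cast<unsigned>(numa_node_id) < sizeof(unsigned long) * 8);
  return 1UL << numa_node_id;
}

int main() {
  std::printf("0x%lx\n", NodeMask(3));  // 0x8
}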
output_desc_[n] = std::make_pair(name, description); @@ -328,7 +328,7 @@ std::ostream& operator<<(std::ostream& out, const OpSchema& schema) { if (schema.max_input_ > 0) { out << "Inputs:" << std::endl; if (!schema.input_desc_.empty()) { - for (int i = 0; i < schema.input_desc_.size(); ++i) { + for (size_t i = 0; i < schema.input_desc_.size(); ++i) { const auto& p = schema.input_desc_[i]; out << " " << i << ", " << (p.first ? p.first : "(unnamed)") << " : " << (p.second ? p.second : "(no doc)") << std::endl; @@ -340,7 +340,7 @@ std::ostream& operator<<(std::ostream& out, const OpSchema& schema) { if (schema.max_output_ > 0) { out << "Outputs:" << std::endl; if (!schema.output_desc_.empty()) { - for (int i = 0; i < schema.output_desc_.size(); ++i) { + for (size_t i = 0; i < schema.output_desc_.size(); ++i) { const auto& p = schema.output_desc_[i]; out << " " << i << ", " << (p.first ? p.first : "(unnamed)") << " : " << (p.second ? p.second : "(no doc)") << std::endl; diff --git a/caffe2/core/plan_executor.cc b/caffe2/core/plan_executor.cc index 7574470f798b34..5f6ff342ed79c5 100644 --- a/caffe2/core/plan_executor.cc +++ b/caffe2/core/plan_executor.cc @@ -437,7 +437,7 @@ bool ExecuteStepRecursive(ExecutionStepWrapper& stepWrapper) { if (step.has_num_concurrent_instances()) { numThreads *= step.num_concurrent_instances(); } - for (int64_t i = 0; i < numThreads; ++i) { + for (size_t i = 0; i < numThreads; ++i) { threads.emplace_back(worker); } for (auto& thread : threads) { diff --git a/caffe2/core/predictor.cc b/caffe2/core/predictor.cc index 0462ad7aded3e0..e82aa098ce6e8e 100644 --- a/caffe2/core/predictor.cc +++ b/caffe2/core/predictor.cc @@ -122,8 +122,8 @@ Predictor::Predictor( } bool Predictor::run(const TensorVector& inputs, TensorVector* outputs) { - CAFFE_ENFORCE(inputs.size() <= run_net_.external_input_size()); - for (auto i = 0; i < inputs.size(); ++i) { + CAFFE_ENFORCE(inputs.size() <= (unsigned)run_net_.external_input_size()); + for (size_t i = 0; i < inputs.size(); ++i) { shareInputTensor(&ws_, run_net_.external_input(i), inputs[i]); } @@ -132,7 +132,7 @@ bool Predictor::run(const TensorVector& inputs, TensorVector* outputs) { } outputs->resize(run_net_.external_output_size()); - for (auto i = 0; i < outputs->size(); ++i) { + for (size_t i = 0; i < outputs->size(); ++i) { (*outputs)[i] = extractOutputTensor(&ws_, run_net_.external_output(i)); } return true; @@ -158,7 +158,7 @@ bool Predictor::run_map(const TensorMap& inputs, TensorVector* outputs) { } outputs->resize(run_net_.external_output_size()); - for (auto i = 0; i < outputs->size(); ++i) { + for (size_t i = 0; i < outputs->size(); ++i) { (*outputs)[i] = extractOutputTensor(&ws_, run_net_.external_output(i)); } return true; diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index 852b605ac2aecf..1be8193400f11c 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -84,7 +84,7 @@ struct TensorCPUStatGetter : BlobStatGetter { auto nbytes = tensor.nbytes(); if (nbytes > 0 && tensor.IsType()) { const auto* data = tensor.data(); - for (size_t i = 0; i < tensor.size(); ++i) { + for (int i = 0; i < tensor.size(); ++i) { nbytes += data[i].size(); } } diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 338ffe03d70539..84c7c98e73b930 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -39,7 +39,7 @@ inline vector ToVectorTIndex(const std::vector& src) { */ inline TIndex size_from_dim_(int k, const vector& dims) { TIndex r = 1; - for (int i = k; i < dims.size(); ++i) { + for (size_t i = k; i < 
dims.size(); ++i) { r *= dims[i]; } return r; @@ -47,7 +47,7 @@ inline TIndex size_from_dim_(int k, const vector& dims) { // Product of all dims up to inline TIndex size_to_dim_(int k, const vector& dims) { - CAFFE_ENFORCE(k <= dims.size()); + CAFFE_ENFORCE((unsigned)k <= dims.size()); TIndex r = 1; for (int i = 0; i < k; ++i) { r *= dims[i]; @@ -57,7 +57,7 @@ inline TIndex size_to_dim_(int k, const vector& dims) { // Product of all dims between k and l (not including dims[k] and dims[l]) inline TIndex size_between_dim_(int k, int l, const vector& dims) { - CAFFE_ENFORCE(l < dims.size()); + CAFFE_ENFORCE((unsigned)l < dims.size()); TIndex r = 1; if (k < l) { for (int i = k + 1; i < l; ++i) { @@ -694,7 +694,7 @@ class Tensor { auto old_size = size_; dims_.resize(src.size()); TIndex new_size = 1; - for (unsigned int i = 0; i < src.size(); ++i) { + for (size_t i = 0; i < src.size(); ++i) { new_size *= src[i]; dims_[i] = src[i]; } diff --git a/caffe2/core/transform.cc b/caffe2/core/transform.cc index 51ac5d1297cec3..5ea42fe83b9b83 100644 --- a/caffe2/core/transform.cc +++ b/caffe2/core/transform.cc @@ -20,7 +20,7 @@ std::vector> Transform::PatternMatch(const Graph& graph) { std::vector> matches; // Consider every possible node as the starting point. - for (int idx = 0; idx < graph.size(); ++idx) { + for (int idx = 0; idx < (int)graph.size(); ++idx) { // The current working subgraph. We will try to add new nodes to this, // when invoking the PatternRule. std::vector subgraph; @@ -81,7 +81,7 @@ void Transform::PatternMatchHelper( best_subgraph = subgraph; } - int size_before = subgraph.size(); + size_t size_before = subgraph.size(); if (pattern_match_type_ == CONNECTED_SUBGRAPH) { // Connected Component Order Pattern Matching @@ -89,7 +89,7 @@ void Transform::PatternMatchHelper( // Try adding each parent and child of every node in the subgraph, // and see if we can accept it. - for (int i = 0; i < subgraph.size(); i++) { + for (size_t i = 0; i < subgraph.size(); i++) { int x = subgraph[i]; TryNeighbors( graph, @@ -119,11 +119,11 @@ void Transform::PatternMatchHelper( // node in our current subgraph. // Thus, we simply iterate over the nodes that come AFTER the last node of // our current subgraph. - int start_idx = 0; + size_t start_idx = 0; if (subgraph.size() > 0) { start_idx = subgraph.back() + 1; } - for (int i = start_idx; i < graph.size(); i++) { + for (size_t i = start_idx; i < graph.size(); i++) { if (!matched.at(i) && PatternRule(graph, subgraph, i)) { subgraph.push_back(i); PatternMatchHelper(graph, matched, subgraph_ptr, best_subgraph_ptr); @@ -136,7 +136,7 @@ void Transform::PatternMatchHelper( // For every current subgraph, we consider all nodes to be // the next candidate node, as long as it isn't already matched. - for (int i = 0; i < graph.size(); i++) { + for (size_t i = 0; i < graph.size(); i++) { if (std::find(subgraph.begin(), subgraph.end(), i) == subgraph.end()) { // Then we try appending it to the subgraph. 
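// Standalone illustrative sketch of the size_from_dim_ / size_to_dim_ helpers
// touched above: products over the trailing and leading dimensions of a
// shape, with unsigned loop counters and bound checks matching this diff.
// TIndex is assumed to be a 64-bit signed integer, as in Caffe2.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

using TIndex = int64_t;

TIndex SizeFromDim(int k, const std::vector<TIndex>& dims) {
  TIndex r = 1;
  for (std::size_t i = static_cast<std::size_t>(k); i < dims.size(); ++i) {
    r *= dims[i];  // product of dims[k..end)
  }
  return r;
}

TIndex SizeToDim(int k, const std::vector<TIndex>& dims) {
  assert(static_cast<std::size_t>(k) <= dims.size());
  TIndex r = 1;
  for (int i = 0; i < k; ++i) {
    r *= dims[i];  // product of dims[0..k)
  }
  return r;
}

int main() {
  const std::vector<TIndex> dims = {2, 3, 4, 5};
  std::printf("%lld %lld\n",
              static_cast<long long>(SizeFromDim(2, dims)),  // 4 * 5 = 20
              static_cast<long long>(SizeToDim(2, dims)));   // 2 * 3 = 6
}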
if (!matched.at(i) && PatternRule(graph, subgraph, i)) { diff --git a/caffe2/core/typeid.h b/caffe2/core/typeid.h index 3d57baceff7cec..cad012e43211ef 100644 --- a/caffe2/core/typeid.h +++ b/caffe2/core/typeid.h @@ -219,7 +219,7 @@ class TypeMeta { template static void _Ctor(void* ptr, size_t n) { T* typed_ptr = static_cast(ptr); - for (int i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) { new (typed_ptr + i) T; } } @@ -231,7 +231,7 @@ class TypeMeta { static void _Copy(const void* src, void* dst, size_t n) { const T* typed_src = static_cast(src); T* typed_dst = static_cast(dst); - for (int i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) { typed_dst[i] = typed_src[i]; } } @@ -253,7 +253,7 @@ class TypeMeta { template static void _Dtor(void* ptr, size_t n) { T* typed_ptr = static_cast(ptr); - for (int i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) { typed_ptr[i].~T(); } } diff --git a/caffe2/ideep/operators/elementwise_sum_op.cc b/caffe2/ideep/operators/elementwise_sum_op.cc index eb9baf99d6990a..373d8801368b91 100644 --- a/caffe2/ideep/operators/elementwise_sum_op.cc +++ b/caffe2/ideep/operators/elementwise_sum_op.cc @@ -12,7 +12,7 @@ class IDEEPSumOp final : public IDEEPOperator { virtual ~IDEEPSumOp() {} bool RunOnDevice() override { - const auto &X = Input(INPUT0); + const auto& X = Input(INPUT0); auto* Y = Output(OUTPUT); if (InputSize() == 1) { @@ -20,8 +20,15 @@ class IDEEPSumOp final : public IDEEPOperator { } else { vector inputs; - vector scales (InputSize(), 1.0); + const vector scales(InputSize(), 1.0); + const auto dims = X.get_dims(); for (int i = 0; i < InputSize(); ++i) { + if (Input(i).get_dims() != dims) { + CAFFE_ENFORCE_EQ( + dims, + Input(i).get_dims(), + "Broadcast is not yet supported with IDEEP."); + } inputs.emplace_back(Input(i)); } @@ -32,11 +39,11 @@ class IDEEPSumOp final : public IDEEPOperator { } private: - INPUT_TAGS(INPUT0); OUTPUT_TAGS(OUTPUT); }; REGISTER_IDEEP_OPERATOR(Sum, IDEEPSumOp); +REGISTER_IDEEP_OPERATOR(Add, IDEEPSumOp); } // namespace caffe2 diff --git a/caffe2/ideep/operators/operator_fallback_ideep.cc b/caffe2/ideep/operators/operator_fallback_ideep.cc index 216633e38d8a06..1f3ad1eb98ce41 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.cc +++ b/caffe2/ideep/operators/operator_fallback_ideep.cc @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -53,5 +54,7 @@ REGISTER_IDEEP_OPERATOR( REGISTER_IDEEP_OPERATOR( GivenTensorFill, IDEEPFallbackOp>); +REGISTER_IDEEP_OPERATOR(Load, IDEEPFallbackOp>); +REGISTER_IDEEP_OPERATOR(Save, IDEEPFallbackOp>); } // namespace caffe2 diff --git a/caffe2/ideep/operators/squeeze_op.cc b/caffe2/ideep/operators/squeeze_op.cc new file mode 100644 index 00000000000000..fe78e30a7d3369 --- /dev/null +++ b/caffe2/ideep/operators/squeeze_op.cc @@ -0,0 +1,59 @@ +#include +#include "caffe2/operators/expand_squeeze_dims_op.h" + +namespace caffe2 { + +class IDEEPSqueezeOp final : public IDEEPOperator { + public: + USE_IDEEP_DEF_ALIASES(); + USE_IDEEP_OPERATOR_FUNCTIONS(); + + IDEEPSqueezeOp(const OperatorDef& operator_def, Workspace* ws) + : IDEEPOperator(operator_def, ws), + dims_(OperatorBase::GetRepeatedArgument("dims")) { + auto originalSize = dims_.size(); + CAFFE_ENFORCE(originalSize > 0, "Parameter `dims` must be provided."); + + std::sort(dims_.begin(), dims_.end()); + dims_.erase(std::unique(dims_.begin(), dims_.end()), dims_.end()); + if (dims_.size() < originalSize) { + LOG(WARNING) << "Parameter `dims` has repeated dimensions."; + } + 
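// Standalone illustrative sketch of the typed construct/copy/destroy helpers
// in TypeMeta above (_Ctor, _Copy, _Dtor): n objects of type T are placement-
// constructed, copied, and destroyed in raw storage, with unsigned loop
// counters as in this diff. The helper names are hypothetical.
#include <cstddef>
#include <cstdio>
#include <new>
#include <string>

template <typename T>
void CtorN(void* ptr, std::size_t n) {
  T* typed = static_cast<T*>(ptr);
  for (std::size_t i = 0; i < n; ++i) {
    new (typed + i) T;  // default-construct in place
  }
}

template <typename T>
void CopyN(const void* src, void* dst, std::size_t n) {
  const T* s = static_cast<const T*>(src);
  T* d = static_cast<T*>(dst);
  for (std::size_t i = 0; i < n; ++i) {
    d[i] = s[i];  // both ranges must already hold constructed objects
  }
}

template <typename T>
void DtorN(void* ptr, std::size_t n) {
  T* typed = static_cast<T*>(ptr);
  for (std::size_t i = 0; i < n; ++i) {
    typed[i].~T();
  }
}

int main() {
  alignas(std::string) unsigned char a[2 * sizeof(std::string)];
  alignas(std::string) unsigned char b[2 * sizeof(std::string)];
  CtorN<std::string>(a, 2);
  CtorN<std::string>(b, 2);
  reinterpret_cast<std::string*>(a)[0] = "hello";
  CopyN<std::string>(a, b, 2);
  std::printf("%s\n", reinterpret_cast<std::string*>(b)[0].c_str());  // hello
  DtorN<std::string>(a, 2);
  DtorN<std::string>(b, 2);
}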
CAFFE_ENFORCE(dims_.front() >= 0, "Dimension ids must be non-negative."); + } + + virtual ~IDEEPSqueezeOp() {} + + bool RunOnDevice() override { + const auto& X = Input(INPUT); + auto* Y = Output(OUTPUT); + + CAFFE_ENFORCE_GT( + X.ndims(), + dims_.back(), + "Input needs at least ", + (dims_.back() + 1), + " dimensions."); + const auto& ideep_dims = X.get_dims(); + vector dims(ideep_dims.begin(), ideep_dims.end()); + const auto& new_dims = SqueezeOp::ComputeDims(dims, dims_); + itensor::dims new_dims_ideep(new_dims.begin(), new_dims.end()); + if (&X != Y) { + // Copy if not inplace + ideep::direct_copy::compute(X, *Y); + } + Y->reshape(new_dims_ideep); + + return true; + } + + private: + vector dims_; + + INPUT_TAGS(INPUT); + OUTPUT_TAGS(OUTPUT); +}; + +REGISTER_IDEEP_OPERATOR(Squeeze, IDEEPSqueezeOp); + +} // namespace caffe2 diff --git a/caffe2/mkl/operators/operator_fallback_mkl.cc b/caffe2/mkl/operators/operator_fallback_mkl.cc index a7e9082b7c5994..cb865c5dcbe85b 100644 --- a/caffe2/mkl/operators/operator_fallback_mkl.cc +++ b/caffe2/mkl/operators/operator_fallback_mkl.cc @@ -5,7 +5,7 @@ #include "caffe2/operators/cross_entropy_op.h" #include "caffe2/operators/dropout_op.h" #include "caffe2/operators/elementwise_linear_op.h" -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/elementwise_ops.h" #include "caffe2/operators/filler_op.h" #include "caffe2/operators/load_save_op.h" #include "caffe2/operators/loss_op.h" @@ -16,14 +16,17 @@ namespace caffe2 { namespace { + struct SigmoidCPUFunctor { template - inline void - operator()(const int n, const T* x, T* y, CPUContext* /*device_context*/) { + bool operator()(const int n, const T* x, T* y, CPUContext* /* context */) + const { ConstEigenVectorArrayMap xM(x, n); EigenVectorArrayMap(y, n) = 1. / (1. 
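// Standalone illustrative sketch of the shape bookkeeping behind the new
// IDEEP Squeeze operator above: the listed axes (assumed sorted, deduplicated
// and non-negative, as the constructor enforces) are dropped from the shape.
// SqueezeDims is a hypothetical helper; the real operator delegates to
// SqueezeOp::ComputeDims and then reshapes the itensor in place.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<int64_t> SqueezeDims(const std::vector<int64_t>& dims,
                                 const std::vector<int>& axes) {
  std::vector<int64_t> out;
  std::size_t a = 0;
  for (std::size_t i = 0; i < dims.size(); ++i) {
    if (a < axes.size() && static_cast<std::size_t>(axes[a]) == i) {
      ++a;  // drop this axis; callers typically also enforce dims[i] == 1
      continue;
    }
    out.push_back(dims[i]);
  }
  return out;
}

int main() {
  const auto out = SqueezeDims({1, 3, 1, 5}, {0, 2});
  for (const auto d : out) {
    std::printf("%lld ", static_cast<long long>(d));  // 3 5
  }
  std::printf("\n");
}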
+ (-xM).exp()); + return true; } }; + } // namespace } // namespace caffe2 diff --git a/caffe2/operators/abs_op.cc b/caffe2/operators/abs_op.cc index 75379beb4a6c12..40e4568309a61b 100644 --- a/caffe2/operators/abs_op.cc +++ b/caffe2/operators/abs_op.cc @@ -1,36 +1,37 @@ -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/abs_op.h" -namespace caffe2 { +#include +#include -struct AbsCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Abs(n, x, y, device_context); - } -}; +namespace caffe2 { -struct AbsGradientCPUFunctor { - template - inline void - Run(const int n, const T* x, const T* dy, T* dx, CPUContext* /* unused */) { - ConstEigenVectorArrayMap dyM(dy, n); - ConstEigenVectorArrayMap xM(x, n); - EigenVectorMap(dx, n) = - (xM == T(0)).select(T(0), (xM > T(0)).select(dyM, -dyM)); - } -}; +template <> +template +bool AbsGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap X_arr(X, size); + EigenVectorMap(dX, size) = + (X_arr == T(0)).select(T(0), (X_arr > T(0)).select(dY_arr, -dY_arr)); + return true; +} REGISTER_CPU_OPERATOR( Abs, - UnaryElementwiseOp, CPUContext, AbsCPUFunctor>); + UnaryElementwiseOp, CPUContext, AbsFunctor>); REGISTER_CPU_OPERATOR( AbsGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + AbsGradientFunctor>); OPERATOR_SCHEMA(Abs) .NumInputs(1) @@ -48,15 +49,21 @@ Calculates the absolute value of the given input tensor, element-wise. OPERATOR_SCHEMA(AbsGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape(); +namespace { + class GetAbsGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( "AbsGradient", "", - std::vector{I(0), GO(0)}, - std::vector{GI(0)}); + std::vector{GO(0), I(0)}, + std::vector{GI(0)}); } }; + +} // namespace + REGISTER_GRADIENT(Abs, GetAbsGradient); + } // namespace caffe2 diff --git a/caffe2/operators/abs_op.cu b/caffe2/operators/abs_op.cu index c4ccdd7ffde229..8da37af5777988 100644 --- a/caffe2/operators/abs_op.cu +++ b/caffe2/operators/abs_op.cu @@ -1,61 +1,60 @@ -#include +#include "caffe2/operators/abs_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { -template -__global__ void AbsKernel(const int N, const T* X, T* Y) { - CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = fabs(X[i]); - } -} +namespace { template -__global__ void AbsGradientKernel(const int N, const T* X, const T* dY, T* dX) { +__global__ void +AbsGradientCUDAKernel(const int N, const T* dY, const T* X, T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { +#if __CUDA_ARCH__ >= 350 + dX[i] = __ldg(X + i) == T(0) + ? T(0) + : (__ldg(X + i) > T(0) ? __ldg(dY + i) : -__ldg(dY + i)); +#else dX[i] = X[i] == T(0) ? T(0) : (X[i] > T(0) ? 
dY[i] : -dY[i]); +#endif } } -struct AbsCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - AbsKernel - <<cuda_stream()>>>(n, x, y); - return; - } -}; - -struct AbsGradientCUDAFunctor { - template - inline void Run( - const int n, - const T* x, - const T* dy, - T* dx, - CUDAContext* device_context) { - AbsGradientKernel - <<cuda_stream()>>>(n, x, dy, dx); - return; - } -}; +} // namespace + +template <> +template +bool AbsGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + AbsGradientCUDAKernel + <<cuda_stream()>>>(size, dY, X, dX); + return true; +} REGISTER_CUDA_OPERATOR( Abs, - UnaryElementwiseOp, CUDAContext, AbsCUDAFunctor>); + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + AbsFunctor>); REGISTER_CUDA_OPERATOR( AbsGradient, BinaryElementwiseOp< TensorTypes, CUDAContext, - WithoutBroadcast>); + AbsGradientFunctor>); + } // namespace caffe2 diff --git a/caffe2/operators/abs_op.h b/caffe2/operators/abs_op.h new file mode 100644 index 00000000000000..9b59557e040a62 --- /dev/null +++ b/caffe2/operators/abs_op.h @@ -0,0 +1,34 @@ +#ifndef CAFFE2_OPERATORS_ABS_OP_H_ +#define CAFFE2_OPERATORS_ABS_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct AbsFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Abs(N, X, Y, context); + return true; + } +}; + +template +struct AbsGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& X_dims, + const T* dY, + const T* X, + T* dX, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_ABS_OP_H_ diff --git a/caffe2/operators/acos_op.cc b/caffe2/operators/acos_op.cc index 8db8e7cf2343de..67e8477869cfa7 100644 --- a/caffe2/operators/acos_op.cc +++ b/caffe2/operators/acos_op.cc @@ -1,35 +1,39 @@ -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/acos_op.h" -namespace caffe2 { +#include +#include -struct AcosCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Acos(n, x, y, device_context); - } -}; +namespace caffe2 { -struct AcosGradientCPUFunctor { - template - inline void - Run(const int n, const T* x, const T* dy, T* dx, CPUContext* /* unused */) { - ConstEigenVectorArrayMap dyM(dy, n); - ConstEigenVectorArrayMap xM(x, n); - EigenVectorMap(dx, n) = -dyM / sqrt(1 - xM * xM); - } -}; +template <> +template +bool AcosGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap X_arr(X, size); + EigenVectorMap(dX, size) = -dY_arr * (T(1) - X_arr.square()).rsqrt(); + return true; +} REGISTER_CPU_OPERATOR( Acos, - UnaryElementwiseOp, CPUContext, AcosCPUFunctor>); + UnaryElementwiseOp< + TensorTypes, + CPUContext, + AcosFunctor>); REGISTER_CPU_OPERATOR( AcosGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + AcosGradientFunctor>); OPERATOR_SCHEMA(Acos) .NumInputs(1) @@ 
-44,18 +48,26 @@ Calculates the arccosine of the given input tensor, element-wise. "output", "The arccosine of the input tensor computed element-wise"); -OPERATOR_SCHEMA(AcosGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape(); +OPERATOR_SCHEMA(AcosGradient) + .NumInputs(2) + .NumOutputs(1) + .IdenticalTypeAndShape(); + +namespace { class GetAcosGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( "AcosGradient", "", - std::vector{I(0), GO(0)}, - std::vector{GI(0)}); + std::vector{GO(0), I(0)}, + std::vector{GI(0)}); } }; +} // namespace + REGISTER_GRADIENT(Acos, GetAcosGradient); + } // namespace caffe2 diff --git a/caffe2/operators/acos_op.cu b/caffe2/operators/acos_op.cu index e15085249da83d..e1fcc23104e596 100644 --- a/caffe2/operators/acos_op.cu +++ b/caffe2/operators/acos_op.cu @@ -1,61 +1,60 @@ -#include +#include "caffe2/operators/acos_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { -template -__global__ void AcosKernel(const int N, const T* X, T* Y) { +namespace { + +__global__ void AcosGradientCUDAKernel( + const int N, + const float* dY, + const float* X, + float* dX) { CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = acos(X[i]); +#if __CUDA_ARCH__ >= 350 + dX[i] = -__ldg(dY + i) * rsqrtf(1.0f - __ldg(X + i) * __ldg(X + i)); +#else + dX[i] = -dY[i] * rsqrtf(1.0f - X[i] * X[i]); +#endif } } +} // namespace + +template <> template -__global__ void AcosGradientKernel(const int N, const T* X, const T* dY, T* dX) { - CUDA_1D_KERNEL_LOOP(i, N) { - dX[i] = -dY[i] / sqrt(1 - X[i] * X[i]); - } +bool AcosGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + AcosGradientCUDAKernel<<< + CAFFE_GET_BLOCKS(size), + CAFFE_CUDA_NUM_THREADS, + 0, + context->cuda_stream()>>>(size, dY, X, dX); + return true; } -struct AcosCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - AcosKernel - <<cuda_stream()>>>(n, x, y); - return; - } -}; - -struct AcosGradientCUDAFunctor { - template - inline void Run( - const int n, - const T* x, - const T* dy, - T* dx, - CUDAContext* device_context) { - AcosGradientKernel - <<cuda_stream()>>>(n, x, dy, dx); - return; - } -}; - REGISTER_CUDA_OPERATOR( Acos, - UnaryElementwiseOp, CUDAContext, AcosCUDAFunctor>); + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + AcosFunctor>); REGISTER_CUDA_OPERATOR( AcosGradient, BinaryElementwiseOp< TensorTypes, CUDAContext, - WithoutBroadcast>); + AcosGradientFunctor>); + } // namespace caffe2 diff --git a/caffe2/operators/acos_op.h b/caffe2/operators/acos_op.h new file mode 100644 index 00000000000000..6ca53ce8d784f8 --- /dev/null +++ b/caffe2/operators/acos_op.h @@ -0,0 +1,34 @@ +#ifndef CAFFE2_OPERATORS_ACOS_OP_H_ +#define CAFFE2_OPERATORS_ACOS_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct AcosFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Acos(N, X, Y, context); + return true; + } +}; + +template +struct AcosGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& 
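// Standalone illustrative sketch of the refactor applied to these element-wise
// operators: a small forward functor plus a gradient functor exposing
// Forward(dY_dims, X_dims, dY, X, dX, context), which the real code plugs into
// UnaryElementwiseOp / BinaryElementwiseOp. Here the context is a placeholder
// struct, plain loops replace Eigen, and Acos (dX = -dY / sqrt(1 - X^2)) is
// the worked example; names suffixed with "Sketch" are hypothetical.
#include <cmath>
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

struct CPUContextStub {};

struct AcosFunctorSketch {
  template <typename T>
  bool operator()(const int n, const T* x, T* y, CPUContextStub* /* ctx */) const {
    for (int i = 0; i < n; ++i) {
      y[i] = std::acos(x[i]);
    }
    return true;
  }
};

struct AcosGradientFunctorSketch {
  template <typename T>
  bool Forward(const std::vector<int>& dY_dims,
               const std::vector<int>& /* X_dims */,
               const T* dY,
               const T* X,
               T* dX,
               CPUContextStub* /* ctx */) const {
    const int size = std::accumulate(
        dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies<int>());
    for (int i = 0; i < size; ++i) {
      dX[i] = -dY[i] / std::sqrt(T(1) - X[i] * X[i]);  // d/dx acos(x) = -1/sqrt(1-x^2)
    }
    return true;
  }
};

int main() {
  const std::vector<int> dims = {2, 2};
  const float X[4] = {0.0f, 0.1f, 0.2f, 0.3f};
  const float dY[4] = {1.0f, 1.0f, 1.0f, 1.0f};
  float Y[4], dX[4];
  CPUContextStub ctx;
  AcosFunctorSketch()(4, X, Y, &ctx);
  AcosGradientFunctorSketch().Forward(dims, dims, dY, X, dX, &ctx);
  std::printf("acos(0.3) = %.4f, dX[3] = %.4f\n", Y[3], dX[3]);  // 1.2661, -1.0483
}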
X_dims, + const T* dY, + const T* X, + T* dX, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_ACOS_OP_H_ diff --git a/caffe2/operators/asin_op.cc b/caffe2/operators/asin_op.cc index 8b8b7509603901..163a6403b4d11b 100644 --- a/caffe2/operators/asin_op.cc +++ b/caffe2/operators/asin_op.cc @@ -1,35 +1,39 @@ -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/asin_op.h" -namespace caffe2 { +#include +#include -struct AsinCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Asin(n, x, y, device_context); - } -}; +namespace caffe2 { -struct AsinGradientCPUFunctor { - template - inline void - Run(const int n, const T* x, const T* dy, T* dx, CPUContext* /* unused */) { - ConstEigenVectorArrayMap dyM(dy, n); - ConstEigenVectorArrayMap xM(x, n); - EigenVectorMap(dx, n) = dyM / sqrt(1 - xM * xM); - } -}; +template <> +template +bool AsinGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap X_arr(X, size); + EigenVectorMap(dX, size) = dY_arr * (T(1) - X_arr.square()).rsqrt(); + return true; +} REGISTER_CPU_OPERATOR( Asin, - UnaryElementwiseOp, CPUContext, AsinCPUFunctor>); + UnaryElementwiseOp< + TensorTypes, + CPUContext, + AsinFunctor>); REGISTER_CPU_OPERATOR( AsinGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + AsinGradientFunctor>); OPERATOR_SCHEMA(Asin) .NumInputs(1) @@ -44,18 +48,26 @@ Calculates the arcsine of the given input tensor, element-wise. 
"output", "The arcsine of the input tensor computed element-wise"); -OPERATOR_SCHEMA(AsinGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape(); +OPERATOR_SCHEMA(AsinGradient) + .NumInputs(2) + .NumOutputs(1) + .IdenticalTypeAndShape(); + +namespace { class GetAsinGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( "AsinGradient", "", - std::vector{I(0), GO(0)}, - std::vector{GI(0)}); + std::vector{GO(0), I(0)}, + std::vector{GI(0)}); } }; +} // namespace + REGISTER_GRADIENT(Asin, GetAsinGradient); + } // namespace caffe2 diff --git a/caffe2/operators/asin_op.cu b/caffe2/operators/asin_op.cu index 61de4cf7a29a64..d285cac0d7e09d 100644 --- a/caffe2/operators/asin_op.cu +++ b/caffe2/operators/asin_op.cu @@ -1,61 +1,60 @@ -#include +#include "caffe2/operators/asin_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { -template -__global__ void AsinKernel(const int N, const T* X, T* Y) { +namespace { + +__global__ void AsinGradientCUDAKernel( + const int N, + const float* dY, + const float* X, + float* dX) { CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = asin(X[i]); +#if __CUDA_ARCH__ >= 350 + dX[i] = __ldg(dY + i) * rsqrtf(1.0f - __ldg(X + i) * __ldg(X + i)); +#else + dX[i] = dY[i] * rsqrtf(1.0f - X[i] * X[i]); +#endif } } +} // namespace + +template <> template -__global__ void AsinGradientKernel(const int N, const T* X, const T* dY, T* dX) { - CUDA_1D_KERNEL_LOOP(i, N) { - dX[i] = dY[i] / sqrt(1 - X[i] * X[i]); - } +bool AsinGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + AsinGradientCUDAKernel<<< + CAFFE_GET_BLOCKS(size), + CAFFE_CUDA_NUM_THREADS, + 0, + context->cuda_stream()>>>(size, dY, X, dX); + return true; } -struct AsinCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - AsinKernel - <<cuda_stream()>>>(n, x, y); - return; - } -}; - -struct AsinGradientCUDAFunctor { - template - inline void Run( - const int n, - const T* x, - const T* dy, - T* dx, - CUDAContext* device_context) { - AsinGradientKernel - <<cuda_stream()>>>(n, x, dy, dx); - return; - } -}; - REGISTER_CUDA_OPERATOR( Asin, - UnaryElementwiseOp, CUDAContext, AsinCUDAFunctor>); + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + AsinFunctor>); REGISTER_CUDA_OPERATOR( AsinGradient, BinaryElementwiseOp< TensorTypes, CUDAContext, - WithoutBroadcast>); + AsinGradientFunctor>); + } // namespace caffe2 diff --git a/caffe2/operators/asin_op.h b/caffe2/operators/asin_op.h new file mode 100644 index 00000000000000..0e03f24527855a --- /dev/null +++ b/caffe2/operators/asin_op.h @@ -0,0 +1,34 @@ +#ifndef CAFFE2_OPERATORS_ASIN_OP_H_ +#define CAFFE2_OPERATORS_ASIN_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct AsinFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Asin(N, X, Y, context); + return true; + } +}; + +template +struct AsinGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& X_dims, + const T* dY, + const T* X, + T* dX, + Context* context) const; +}; + +} // 
namespace caffe2 + +#endif // CAFFE2_OPERATORS_ASIN_OP_H_ diff --git a/caffe2/operators/atan_op.cc b/caffe2/operators/atan_op.cc index f93d62ea18353f..47e6cf3be52be6 100644 --- a/caffe2/operators/atan_op.cc +++ b/caffe2/operators/atan_op.cc @@ -1,35 +1,39 @@ -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/atan_op.h" -namespace caffe2 { +#include +#include -struct AtanCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Atan(n, x, y, device_context); - } -}; +namespace caffe2 { -struct AtanGradientCPUFunctor { - template - inline void - Run(const int n, const T* x, const T* dy, T* dx, CPUContext* /* unused */) { - ConstEigenVectorArrayMap dyM(dy, n); - ConstEigenVectorArrayMap xM(x, n); - EigenVectorMap(dx, n) = dyM / (1 + xM * xM); - } -}; +template <> +template +bool AtanGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap X_arr(X, size); + EigenVectorMap(dX, size) = dY_arr / (T(1) + X_arr.square()); + return true; +} REGISTER_CPU_OPERATOR( Atan, - UnaryElementwiseOp, CPUContext, AtanCPUFunctor>); + UnaryElementwiseOp< + TensorTypes, + CPUContext, + AtanFunctor>); REGISTER_CPU_OPERATOR( AtanGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + AtanGradientFunctor>); OPERATOR_SCHEMA(Atan) .NumInputs(1) @@ -44,18 +48,26 @@ Calculates the arctangent of the given input tensor, element-wise. "output", "The arctangent of the input tensor computed element-wise"); -OPERATOR_SCHEMA(AtanGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape(); +OPERATOR_SCHEMA(AtanGradient) + .NumInputs(2) + .NumOutputs(1) + .IdenticalTypeAndShape(); + +namespace { class GetAtanGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( "AtanGradient", "", - std::vector{I(0), GO(0)}, - std::vector{GI(0)}); + std::vector{GO(0), I(0)}, + std::vector{GI(0)}); } }; +} // namespace + REGISTER_GRADIENT(Atan, GetAtanGradient); + } // namespace caffe2 diff --git a/caffe2/operators/atan_op.cu b/caffe2/operators/atan_op.cu index e1cc0c00b5681d..c71015d79b5814 100644 --- a/caffe2/operators/atan_op.cu +++ b/caffe2/operators/atan_op.cu @@ -1,61 +1,58 @@ -#include +#include "caffe2/operators/atan_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { +namespace { + template -__global__ void AtanKernel(const int N, const T* X, T* Y) { +__global__ void +AtanGradientCUDAKernel(const int N, const T* dY, const T* X, T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = atan(X[i]); +#if __CUDA_ARCH__ >= 350 + dX[i] = __ldg(dY + i) / (T(1) + __ldg(X + i) * __ldg(X + i)); +#else + dX[i] = dY[i] / (T(1) + X[i] * X[i]); +#endif } } +} // namespace + +template <> template -__global__ void AtanGradientKernel(const int N, const T* X, const T* dY, T* dX) { - CUDA_1D_KERNEL_LOOP(i, N) { - dX[i] = dY[i] / (1 + X[i] * X[i]); - } +bool AtanGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CUDAContext* context) const { + const int size = 
std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + AtanGradientCUDAKernel + <<cuda_stream()>>>(size, dY, X, dX); + return true; } -struct AtanCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - AtanKernel - <<cuda_stream()>>>(n, x, y); - return; - } -}; - -struct AtanGradientCUDAFunctor { - template - inline void Run( - const int n, - const T* x, - const T* dy, - T* dx, - CUDAContext* device_context) { - AtanGradientKernel - <<cuda_stream()>>>(n, x, dy, dx); - return; - } -}; - REGISTER_CUDA_OPERATOR( Atan, - UnaryElementwiseOp, CUDAContext, AtanCUDAFunctor>); + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + AtanFunctor>); REGISTER_CUDA_OPERATOR( AtanGradient, BinaryElementwiseOp< TensorTypes, CUDAContext, - WithoutBroadcast>); + AtanGradientFunctor>); + } // namespace caffe2 diff --git a/caffe2/operators/atan_op.h b/caffe2/operators/atan_op.h new file mode 100644 index 00000000000000..6bde917fd5db83 --- /dev/null +++ b/caffe2/operators/atan_op.h @@ -0,0 +1,34 @@ +#ifndef CAFFE2_OPERATORS_ATAN_OP_H_ +#define CAFFE2_OPERATORS_ATAN_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct AtanFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Atan(N, X, Y, context); + return true; + } +}; + +template +struct AtanGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& X_dims, + const T* dY, + const T* X, + T* dX, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_ATAN_OP_H_ diff --git a/caffe2/operators/cos_op.cc b/caffe2/operators/cos_op.cc index 8328a3c47046be..8f65fb81dd7690 100644 --- a/caffe2/operators/cos_op.cc +++ b/caffe2/operators/cos_op.cc @@ -1,35 +1,36 @@ -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/cos_op.h" -namespace caffe2 { +#include +#include -struct CosCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Cos(n, x, y, device_context); - } -}; +namespace caffe2 { -struct CosGradientCPUFunctor { - template - inline void - Run(const int n, const T* x, const T* dy, T* dx, CPUContext* /* unused */) { - ConstEigenVectorArrayMap dyM(dy, n); - ConstEigenVectorArrayMap xM(x, n); - EigenVectorMap(dx, n) = -dyM * sin(xM); - } -}; +template <> +template +bool CosGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap X_arr(X, size); + EigenVectorMap(dX, size) = -dY_arr * X_arr.sin(); + return true; +} REGISTER_CPU_OPERATOR( Cos, - UnaryElementwiseOp, CPUContext, CosCPUFunctor>); + UnaryElementwiseOp, CPUContext, CosFunctor>); REGISTER_CPU_OPERATOR( CosGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + CosGradientFunctor>); OPERATOR_SCHEMA(Cos) .NumInputs(1) @@ -46,15 +47,21 @@ Calculates the cosine of the given input tensor, element-wise. 
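// Quick numerical sanity check (illustrative only, not a Caffe2 test) of the
// element-wise derivatives implemented by the gradient kernels above:
//   d/dx asin(x) =  1 / sqrt(1 - x^2)
//   d/dx atan(x) =  1 / (1 + x^2)
//   d/dx cos(x)  = -sin(x)
// Each analytic value is compared against a central finite difference.
#include <cmath>
#include <cstdio>

int main() {
  const double x = 0.3;
  const double h = 1e-6;
  struct Check {
    const char* name;
    double analytic;
    double numeric;
  } checks[] = {
      {"asin", 1.0 / std::sqrt(1.0 - x * x),
       (std::asin(x + h) - std::asin(x - h)) / (2.0 * h)},
      {"atan", 1.0 / (1.0 + x * x),
       (std::atan(x + h) - std::atan(x - h)) / (2.0 * h)},
      {"cos", -std::sin(x),
       (std::cos(x + h) - std::cos(x - h)) / (2.0 * h)},
  };
  for (const Check& c : checks) {
    std::printf("%-4s analytic=%.6f numeric=%.6f\n", c.name, c.analytic, c.numeric);
  }
}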
OPERATOR_SCHEMA(CosGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape(); +namespace { + class GetCosGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( "CosGradient", "", - std::vector{I(0), GO(0)}, - std::vector{GI(0)}); + std::vector{GO(0), I(0)}, + std::vector{GI(0)}); } }; + +} // namespace + REGISTER_GRADIENT(Cos, GetCosGradient); + } // namespace caffe2 diff --git a/caffe2/operators/cos_op.cu b/caffe2/operators/cos_op.cu index 76aa1e443ac923..fcabc6781c78d0 100644 --- a/caffe2/operators/cos_op.cu +++ b/caffe2/operators/cos_op.cu @@ -1,61 +1,58 @@ -#include +#include "caffe2/operators/cos_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { -template -__global__ void CosKernel(const int N, const T* X, T* Y) { - CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = cos(X[i]); - } -} +namespace { template -__global__ void CosGradientKernel(const int N, const T* X, const T* dY, T* dX) { +__global__ void +CosGradientCUDAKernel(const int N, const T* dY, const T* X, T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { +#if __CUDA_ARCH__ >= 350 + dX[i] = -__ldg(dY + i) * sin(__ldg(X + i)); +#else dX[i] = -dY[i] * sin(X[i]); +#endif } } -struct CosCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - CosKernel - <<cuda_stream()>>>(n, x, y); - return; - } -}; - -struct CosGradientCUDAFunctor { - template - inline void Run( - const int n, - const T* x, - const T* dy, - T* dx, - CUDAContext* device_context) { - CosGradientKernel - <<cuda_stream()>>>(n, x, dy, dx); - return; - } -}; +} // namespace + +template <> +template +bool CosGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + CosGradientCUDAKernel<<< + CAFFE_GET_BLOCKS(size), + CAFFE_CUDA_NUM_THREADS, + 0, + context->cuda_stream()>>>(size, dY, X, dX); + return true; +} REGISTER_CUDA_OPERATOR( Cos, - UnaryElementwiseOp, CUDAContext, CosCUDAFunctor>); + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + CosFunctor>); REGISTER_CUDA_OPERATOR( CosGradient, BinaryElementwiseOp< TensorTypes, CUDAContext, - WithoutBroadcast>); + CosGradientFunctor>); + } // namespace caffe2 diff --git a/caffe2/operators/cos_op.h b/caffe2/operators/cos_op.h new file mode 100644 index 00000000000000..5a25172fa8c77d --- /dev/null +++ b/caffe2/operators/cos_op.h @@ -0,0 +1,34 @@ +#ifndef CAFFE2_OPERATORS_COS_OP_H_ +#define CAFFE2_OPERATORS_COS_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct CosFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Cos(N, X, Y, context); + return true; + } +}; + +template +struct CosGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& X_dims, + const T* dY, + const T* X, + T* dX, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_COS_OP_H_ diff --git a/caffe2/operators/elementwise_add_op.cc b/caffe2/operators/elementwise_add_op.cc index d914876e2ae0af..b02c956d1b78df 100644 --- a/caffe2/operators/elementwise_add_op.cc +++ b/caffe2/operators/elementwise_add_op.cc @@ -1,10 
+1,33 @@ -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/elementwise_add_op.h" namespace caffe2 { -// See the operations supported here: -// https://eigen.tuxfamily.org/dox-devel/group__QuickRefPage.html -#define EIGEN_ADD(x, y) ((x) + (y)) -EIGEN_FUNCTOR(Add, EIGEN_ADD, NumericTypes, SameTypeAsInput); -#undef EIGEN_ADD -} +REGISTER_CPU_OPERATOR( + Add, + BinaryElementwiseOp>); +REGISTER_CPU_OPERATOR( + AddGradient, + BinaryElementwiseGradientOp< + NumericTypes, + CPUContext, + AddFunctor>); + +namespace { + +class GetAddGradient final : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + + std::vector GetGradientDefs() override { + return SingleGradientDef( + "AddGradient", + "", + std::vector{GO(0), I(0), I(1)}, + std::vector{GI(0), GI(1)}); + } +}; + +} // namespace + +REGISTER_GRADIENT(Add, GetAddGradient); + +} // namespace caffe2 diff --git a/caffe2/operators/elementwise_add_op.h b/caffe2/operators/elementwise_add_op.h new file mode 100644 index 00000000000000..5af98f7f483a39 --- /dev/null +++ b/caffe2/operators/elementwise_add_op.h @@ -0,0 +1,73 @@ +#ifndef CAFFE2_OPERATORS_ELEMENTWISE_ADD_OP_H_ +#define CAFFE2_OPERATORS_ELEMENTWISE_ADD_OP_H_ + +#include +#include +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct AddFunctor { + template + bool Forward( + const std::vector& A_dims, + const std::vector& B_dims, + const TIn* A, + const TIn* B, + TOut* C, + Context* context) const { + math::Add( + A_dims.size(), + A_dims.data(), + B_dims.size(), + B_dims.data(), + A, + B, + C, + context); + return true; + } + + template + bool Backward( + const std::vector& A_dims, + const std::vector& B_dims, + const TGrad* dC, + const TIn* /* A */, + const TIn* /* B */, + const TOut* /* C */, + TGrad* dA, + TGrad* dB, + Context* context) const { + const std::vector C_dims = + ComputeBinaryBroadcastForwardDims(A_dims, B_dims); + std::vector A_axes; + std::vector B_axes; + ComputeBinaryBroadcastBackwardAxes(A_dims, B_dims, &A_axes, &B_axes); + math::ReduceSum( + C_dims.size(), + C_dims.data(), + A_axes.size(), + A_axes.data(), + dC, + dA, + context); + math::ReduceSum( + C_dims.size(), + C_dims.data(), + B_axes.size(), + B_axes.data(), + dC, + dB, + context); + return true; + } +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_ELEMENTWISE_ADD_OP_H_ diff --git a/caffe2/operators/elementwise_add_op_gpu.cc b/caffe2/operators/elementwise_add_op_gpu.cc new file mode 100644 index 00000000000000..2f38bbd36e1569 --- /dev/null +++ b/caffe2/operators/elementwise_add_op_gpu.cc @@ -0,0 +1,17 @@ +#include "caffe2/operators/elementwise_add_op.h" + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +REGISTER_CUDA_OPERATOR( + Add, + BinaryElementwiseOp>); +REGISTER_CUDA_OPERATOR( + AddGradient, + BinaryElementwiseGradientOp< + NumericTypes, + CUDAContext, + AddFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/elementwise_div_op.cc b/caffe2/operators/elementwise_div_op.cc index 955ab848d566a6..226589bd997e4a 100644 --- a/caffe2/operators/elementwise_div_op.cc +++ b/caffe2/operators/elementwise_div_op.cc @@ -1,27 +1,120 @@ -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/elementwise_div_op.h" + +#include +#include namespace caffe2 { -// See the operations supported here: -// https://eigen.tuxfamily.org/dox-devel/group__QuickRefPage.html -#define EIGEN_DIV(x, y) ((x) / (y)) -EIGEN_FUNCTOR(Div, EIGEN_DIV, NumericTypes, 
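// Standalone illustrative sketch of the broadcast bookkeeping behind
// AddFunctor::Backward above: C has the NumPy-style broadcast shape of A and
// B, and the gradient of each input is dC summed over the axes where that
// input was broadcast (size-1 or missing leading axes). The real code calls
// ComputeBinaryBroadcastBackwardAxes and math::ReduceSum; BroadcastDims below
// is a hypothetical stand-in that only reports the reduction axes for A.
#include <algorithm>
#include <cstdio>
#include <vector>

std::vector<int> BroadcastDims(const std::vector<int>& A,
                               const std::vector<int>& B,
                               std::vector<int>* A_reduce_axes) {
  const int ndim = static_cast<int>(std::max(A.size(), B.size()));
  const int off_a = ndim - static_cast<int>(A.size());
  const int off_b = ndim - static_cast<int>(B.size());
  std::vector<int> C(ndim);
  A_reduce_axes->clear();
  for (int i = 0; i < ndim; ++i) {
    const int a = i < off_a ? 1 : A[i - off_a];
    const int b = i < off_b ? 1 : B[i - off_b];
    C[i] = std::max(a, b);  // assumes the shapes are broadcast-compatible
    if (a == 1 && C[i] != 1) {
      A_reduce_axes->push_back(i);  // dA must sum dC over this output axis
    }
  }
  return C;
}

int main() {
  std::vector<int> axes;
  const auto C = BroadcastDims({3, 1, 5}, {4, 5}, &axes);
  for (const int d : C) std::printf("%d ", d);      // 3 4 5
  std::printf("| reduce dC for dA over axes:");
  for (const int a : axes) std::printf(" %d", a);   // 1
  std::printf("\n");
}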
SameTypeAsInput); -#undef EIGEN_DIV - -void ElementWiseDivide( - CPUContext& /* unused context */, - const int n, - float* dXdata, - float* dYdata, - const float* dZdata, - const float* Ydata, - const float* Zdata) { - ConstEigenVectorArrayMap dZdataVec(dZdata, n); - ConstEigenVectorArrayMap YdataVec(Ydata, n); - ConstEigenVectorArrayMap ZdataVec(Zdata, n); - EigenVectorArrayMap(dXdata, n) = dZdataVec / YdataVec; - EigenVectorArrayMap(dYdata, n) = - (dZdataVec * ZdataVec) / YdataVec; +namespace { + +template +void ComputeDivGradient( + const int ndim, + const int* A_dims, + const int* B_dims, + const int* C_dims, + const TGrad* dC, + const TIn* B, + const TOut* C, + TGrad* dA, + TGrad* dB, + CPUContext* context) { + const int A_size = + std::accumulate(A_dims, A_dims + ndim, 1, std::multiplies()); + const int B_size = + std::accumulate(B_dims, B_dims + ndim, 1, std::multiplies()); + const int C_size = + std::accumulate(C_dims, C_dims + ndim, 1, std::multiplies()); + math::Set(A_size, TGrad(0), dA, context); + math::Set(B_size, TGrad(0), dB, context); + std::vector index(ndim, 0); + for (int C_index = 0; C_index < C_size; ++C_index) { + const int A_index = + math::utils::GetIndexFromDims(ndim, A_dims, index.data()); + const int B_index = + math::utils::GetIndexFromDims(ndim, B_dims, index.data()); + dA[A_index] += dC[C_index] / B[B_index]; + dB[B_index] += -dC[C_index] * C[C_index] / B[B_index]; + math::utils::IncreaseIndexInDims(ndim, C_dims, index.data()); + } } -REGISTER_CPU_OPERATOR(DivGradient, DivGradientOp); +} // namespace + +template <> +template +bool DivFunctor::Backward( + const std::vector& A_dims, + const std::vector& B_dims, + const TGrad* dC, + const TIn* /* A */, + const TIn* B, + const TOut* C, + TGrad* dA, + TGrad* dB, + CPUContext* context) const { + if (A_dims == B_dims) { + const int size = std::accumulate( + A_dims.cbegin(), A_dims.cend(), 1, std::multiplies()); + math::Div(size, dC, B, dA, context); + EigenVectorMap(dB, size) = + -ConstEigenVectorArrayMap(dC, size) * + ConstEigenVectorArrayMap(C, size) / + ConstEigenVectorArrayMap(B, size); + return true; + } + const int ndim = std::max(A_dims.size(), B_dims.size()); + std::vector A_broadcast_dims(ndim); + std::vector B_broadcast_dims(ndim); + std::vector C_broadcast_dims(ndim); + math::utils::ComputeBroadcastBinaryOpDims( + A_dims.size(), + A_dims.data(), + B_dims.size(), + B_dims.data(), + A_broadcast_dims.data(), + B_broadcast_dims.data(), + C_broadcast_dims.data()); + ComputeDivGradient( + ndim, + A_broadcast_dims.data(), + B_broadcast_dims.data(), + C_broadcast_dims.data(), + dC, + B, + C, + dA, + dB, + context); + return true; } + +REGISTER_CPU_OPERATOR( + Div, + BinaryElementwiseOp>); +REGISTER_CPU_OPERATOR( + DivGradient, + BinaryElementwiseGradientOp< + NumericTypes, + CPUContext, + DivFunctor>); + +namespace { + +class GetDivGradient final : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + + std::vector GetGradientDefs() override { + return SingleGradientDef( + "DivGradient", + "", + std::vector{GO(0), I(0), I(1), O(0)}, + std::vector{GI(0), GI(1)}); + } +}; + +} // namespace + +REGISTER_GRADIENT(Div, GetDivGradient); + +} // namespace caffe2 diff --git a/caffe2/operators/elementwise_div_op.cu b/caffe2/operators/elementwise_div_op.cu new file mode 100644 index 00000000000000..acba40b1d57ac9 --- /dev/null +++ b/caffe2/operators/elementwise_div_op.cu @@ -0,0 +1,316 @@ +#include "caffe2/operators/elementwise_div_op.h" + +#include +#include + +#include +#include + +#include 
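Both the ComputeDivGradient loop and the same-shape fast path implement dA = dC / B and dB = -dC * A / B^2, with the second written as -dC * C / B because C = A / B is already available and saves a division. A same-shape, CPU-only sketch of that math, assuming float tensors:

#include <cstddef>

// Sketch: same-shape DivGradient. dA = dC / B, dB = -dC * C / B.
void DivGradientReference(std::size_t n, const float* dC, const float* B,
                          const float* C, float* dA, float* dB) {
  for (std::size_t i = 0; i < n; ++i) {
    dA[i] = dC[i] / B[i];
    dB[i] = -dC[i] * C[i] / B[i];  // equivalent to -dC * A / (B * B)
  }
}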
"caffe2/core/context_gpu.h" + +namespace caffe2 { + +namespace { + +template +using BlockReduce = cub::BlockReduce; + +template +__global__ void ComputeDivAGradientCUDAKernel( + const int outer_size, + const int inner_size, + const SimpleArray C_dims, + const SimpleArray C_strides, + const SimpleArray B_strides, + const SimpleArray A_dims, + const TGrad* dC, + const TIn* B, + TGrad* dA) { + __shared__ typename BlockReduce::TempStorage temp_storage; + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + TGrad sum = 0; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int A_index = i * inner_size + j; + int C_index = 0; + int A_index_val = A_index; +#pragma unroll + for (int d = D - 1; d >= 0; --d) { + C_index += (A_index_val % A_dims.data[d]) * C_strides.data[d]; + A_index_val /= A_dims.data[d]; + } + int B_index = 0; + int C_index_val = C_index; +#pragma unroll + for (int d = D - 1; d >= 0; --d) { + B_index += B_strides.data[d] == 0 + ? 0 + : (C_index_val % C_dims.data[d]) * B_strides.data[d]; + C_index_val /= C_dims.data[d]; + } +#if __CUDA_ARCH__ >= 350 + sum += __ldg(dC + C_index) / __ldg(B + B_index); +#else + sum += dC[C_index] / B[B_index]; +#endif + } + sum = BlockReduce(temp_storage).Reduce(sum, cub::Sum()); + if (threadIdx.x == 0) { + dA[i] = sum; + } + __syncthreads(); + } +} + +template +__global__ void ComputeSimpleDivBGradientCUDAKernel( + const int size, + const TGrad* dC, + const TIn* B, + const TOut* C, + TGrad* dB) { + CUDA_1D_KERNEL_LOOP(i, size) { +#if __CUDA_ARCH__ >= 350 + dB[i] = -__ldg(dC + i) * __ldg(C + i) / __ldg(B + i); +#else + dB[i] = -dC[i] * C[i] / B[i]; +#endif + } +} + +template +__global__ void ComputeDivBGradientCUDAKernel( + const int outer_size, + const int inner_size, + const SimpleArray C_strides, + const SimpleArray B_dims, + const TGrad* dC, + const TIn* B, + const TOut* C, + TGrad* dB) { + __shared__ typename BlockReduce::TempStorage temp_storage; + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + TGrad sum = 0; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + int C_index = 0; + int B_index = i * inner_size + j; +#pragma unroll + for (int d = D - 1; d >= 0; --d) { + C_index += (B_index % B_dims.data[d]) * C_strides.data[d]; + B_index /= B_dims.data[d]; + } +#if __CUDA_ARCH__ >= 350 + sum += -__ldg(dC + C_index) * __ldg(C + C_index) / __ldg(B + i); +#else + sum += -dC[C_index] * C[C_index] / B[i]; +#endif + } + sum = BlockReduce(temp_storage).Reduce(sum, cub::Sum()); + if (threadIdx.x == 0) { + dB[i] = sum; + } + __syncthreads(); + } +} + +template +void ComputeDivAGradientCUDAImpl( + const int outer_size, + const int inner_size, + const int* C_dims, + const int* B_dims, + const int* A_axes, + const TGrad* dC, + const TIn* B, + TGrad* dA, + CUDAContext* context) { + SimpleArray C_dims_arr; + SimpleArray C_strides_arr; + SimpleArray B_strides_arr; + SimpleArray A_dims_arr; + std::copy_n(C_dims, D, C_dims_arr.data); + math::utils::ComputeTransposedStrides(D, C_dims, A_axes, C_strides_arr.data); + int cur_stride = 1; + for (int i = D - 1; i >= 0; --i) { + B_strides_arr.data[i] = B_dims[i] == 1 ? 
0 : cur_stride; + cur_stride *= B_dims[i]; + } + for (int i = 0; i < D; ++i) { + A_dims_arr.data[i] = C_dims[A_axes[i]]; + } + ComputeDivAGradientCUDAKernel + <<cuda_stream()>>>( + outer_size, + inner_size, + C_dims_arr, + C_strides_arr, + B_strides_arr, + A_dims_arr, + dC, + B, + dA); +} + +template +void ComputeDivBGradientCUDAImpl( + const int outer_size, + const int inner_size, + const int* C_dims, + const int* B_axes, + const TGrad* dC, + const TIn* B, + const TOut* C, + TGrad* dB, + CUDAContext* context) { + SimpleArray C_strides_arr; + SimpleArray B_dims_arr; + math::utils::ComputeTransposedStrides(D, C_dims, B_axes, C_strides_arr.data); + for (int i = 0; i < D; ++i) { + B_dims_arr.data[i] = C_dims[B_axes[i]]; + } + ComputeDivBGradientCUDAKernel + <<cuda_stream()>>>( + outer_size, inner_size, C_strides_arr, B_dims_arr, dC, B, C, dB); +} + +template +void ComputeDivAGradientCUDA( + const std::vector& C_dims, + const std::vector& B_dims, + const std::vector& A_axes, + const TGrad* dC, + const TIn* B, + TGrad* dA, + CUDAContext* context) { + CAFFE_ENFORCE_EQ(C_dims.size(), B_dims.size()); + const int ndim = C_dims.size(); + std::vector A_transpose_axes(ndim); + math::utils::ComputeTransposeAxesForReduceOp( + ndim, A_axes.size(), A_axes.data(), A_transpose_axes.data()); + const int pivot = ndim - A_axes.size(); + int outer_size = 1; + for (int i = 0; i < pivot; ++i) { + outer_size *= C_dims[A_transpose_axes[i]]; + } + int inner_size = 1; + for (int i = pivot; i < ndim; ++i) { + inner_size *= C_dims[A_transpose_axes[i]]; + } + DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2( + ndim, + ComputeDivAGradientCUDAImpl, + TGrad, + TIn, + outer_size, + inner_size, + C_dims.data(), + B_dims.data(), + A_transpose_axes.data(), + dC, + B, + dA, + context); +} + +template +void ComputeDivBGradientCUDA( + const std::vector& C_dims, + const std::vector& B_axes, + const TGrad* dC, + const TIn* B, + const TOut* C, + TGrad* dB, + CUDAContext* context) { + const int ndim = C_dims.size(); + std::vector B_transpose_axes(ndim); + math::utils::ComputeTransposeAxesForReduceOp( + ndim, B_axes.size(), B_axes.data(), B_transpose_axes.data()); + const int pivot = ndim - B_axes.size(); + int outer_size = 1; + for (int i = 0; i < pivot; ++i) { + outer_size *= C_dims[B_transpose_axes[i]]; + } + int inner_size = 1; + for (int i = pivot; i < ndim; ++i) { + inner_size *= C_dims[B_transpose_axes[i]]; + } + DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_3( + ndim, + ComputeDivBGradientCUDAImpl, + TGrad, + TIn, + TOut, + outer_size, + inner_size, + C_dims.data(), + B_transpose_axes.data(), + dC, + B, + C, + dB, + context); +} + +} // namespace + +template <> +template +bool DivFunctor::Backward( + const std::vector& A_dims, + const std::vector& B_dims, + const TGrad* dC, + const TIn* /* A */, + const TIn* B, + const TOut* C, + TGrad* dA, + TGrad* dB, + CUDAContext* context) const { + if (A_dims == B_dims) { + const int size = std::accumulate( + A_dims.cbegin(), A_dims.cend(), 1, std::multiplies()); + math::Div(size, dC, B, dA, context); + ComputeSimpleDivBGradientCUDAKernel + <<cuda_stream()>>>(size, dC, B, C, dB); + return true; + } + const int ndim = std::max(A_dims.size(), B_dims.size()); + std::vector A_broadcast_dims(ndim); + std::vector B_broadcast_dims(ndim); + std::vector C_broadcast_dims(ndim); + math::utils::ComputeBroadcastBinaryOpDims( + A_dims.size(), + A_dims.data(), + B_dims.size(), + B_dims.data(), + A_broadcast_dims.data(), + B_broadcast_dims.data(), + C_broadcast_dims.data()); + std::vector A_axes; + std::vector B_axes; + 
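The broadcast-gradient kernels in this file share one reduction shape: each block owns a single reduced output element, its threads stride over the inner extent accumulating private sums, cub::BlockReduce combines them through shared memory, thread 0 writes the result, and __syncthreads() protects the shared temp storage before the next grid-stride iteration. A stripped-down sketch of that pattern (hypothetical kernel name; assumes CUB is on the include path and the launch uses blockDim.x == kBlockSize):

#include <cub/block/block_reduce.cuh>

// Sketch: one block per output element, block-wide sum over inner_size.
template <typename T, int kBlockSize>
__global__ void RowSumKernel(const int outer_size, const int inner_size,
                             const T* in, T* out) {
  using BlockReduce = cub::BlockReduce<T, kBlockSize>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
    T sum = 0;
    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
      sum += in[i * inner_size + j];
    }
    sum = BlockReduce(temp_storage).Sum(sum);
    if (threadIdx.x == 0) {
      out[i] = sum;
    }
    __syncthreads();  // temp_storage is reused on the next outer iteration
  }
}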
ComputeBinaryBroadcastBackwardAxes(A_dims, B_dims, &A_axes, &B_axes); + ComputeDivAGradientCUDA( + C_broadcast_dims, B_broadcast_dims, A_axes, dC, B, dA, context); + ComputeDivBGradientCUDA( + C_broadcast_dims, B_axes, dC, B, C, dB, context); + return true; +} + +REGISTER_CUDA_OPERATOR( + Div, + BinaryElementwiseOp>); +REGISTER_CUDA_OPERATOR( + DivGradient, + BinaryElementwiseGradientOp< + NumericTypes, + CUDAContext, + DivFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/elementwise_div_op.h b/caffe2/operators/elementwise_div_op.h new file mode 100644 index 00000000000000..d89fcd4f5381a0 --- /dev/null +++ b/caffe2/operators/elementwise_div_op.h @@ -0,0 +1,48 @@ +#ifndef CAFFE2_OPERATORS_ELEMENTWISE_DIV_OP_H_ +#define CAFFE2_OPERATORS_ELEMENTWISE_DIV_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct DivFunctor { + template + bool Forward( + const std::vector& A_dims, + const std::vector& B_dims, + const TIn* A, + const TIn* B, + TOut* C, + Context* context) const { + math::Div( + A_dims.size(), + A_dims.data(), + B_dims.size(), + B_dims.data(), + A, + B, + C, + context); + return true; + } + + template + bool Backward( + const std::vector& A_dims, + const std::vector& B_dims, + const TGrad* dC_data, + const TIn* A_data, + const TIn* B_data, + const TOut* C_data, + TGrad* dA_data, + TGrad* dB_data, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_ELEMENTWISE_DIV_OP_H_ diff --git a/caffe2/operators/elementwise_logical_ops.h b/caffe2/operators/elementwise_logical_ops.h index 90ec047e390a0d..99b84c58303975 100644 --- a/caffe2/operators/elementwise_logical_ops.h +++ b/caffe2/operators/elementwise_logical_ops.h @@ -5,7 +5,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/elementwise_ops.h" #include diff --git a/caffe2/operators/elementwise_mul_op.cc b/caffe2/operators/elementwise_mul_op.cc index 75a5e076aef159..9878e613b4b607 100644 --- a/caffe2/operators/elementwise_mul_op.cc +++ b/caffe2/operators/elementwise_mul_op.cc @@ -1,10 +1,117 @@ -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/elementwise_mul_op.h" + +#include +#include namespace caffe2 { -// See the operations supported here: -// https://eigen.tuxfamily.org/dox-devel/group__QuickRefPage.html -#define EIGEN_MUL(x, y) ((x) * (y)) -EIGEN_FUNCTOR(Mul, EIGEN_MUL, NumericTypes, SameTypeAsInput); -#undef EIGEN_MUL +namespace { + +template +void ComputeMulGradient( + const int ndim, + const int* A_dims, + const int* B_dims, + const int* C_dims, + const TGrad* dC, + const TIn* A, + const TIn* B, + TGrad* dA, + TGrad* dB, + CPUContext* context) { + const int A_size = + std::accumulate(A_dims, A_dims + ndim, 1, std::multiplies()); + const int B_size = + std::accumulate(B_dims, B_dims + ndim, 1, std::multiplies()); + const int C_size = + std::accumulate(C_dims, C_dims + ndim, 1, std::multiplies()); + math::Set(A_size, TGrad(0), dA, context); + math::Set(B_size, TGrad(0), dB, context); + std::vector index(ndim, 0); + for (int C_index = 0; C_index < C_size; ++C_index) { + const int A_index = + math::utils::GetIndexFromDims(ndim, A_dims, index.data()); + const int B_index = + math::utils::GetIndexFromDims(ndim, B_dims, index.data()); + dA[A_index] += dC[C_index] * B[B_index]; + dB[B_index] += dC[C_index] * A[A_index]; + 
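MulFunctor's backward is the product rule in elementwise form: with C = A * B, dA = dC * B and dB = dC * A, each then reduced over its operand's broadcast axes exactly as in the Add and Div cases. A same-shape CPU sketch:

#include <cstddef>

// Sketch: same-shape MulGradient. dA = dC * B, dB = dC * A.
void MulGradientReference(std::size_t n, const float* dC, const float* A,
                          const float* B, float* dA, float* dB) {
  for (std::size_t i = 0; i < n; ++i) {
    dA[i] = dC[i] * B[i];
    dB[i] = dC[i] * A[i];
  }
}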
math::utils::IncreaseIndexInDims(ndim, C_dims, index.data()); + } +} + +} // namespace + +template <> +template +bool MulFunctor::Backward( + const std::vector& A_dims, + const std::vector& B_dims, + const TGrad* dC, + const TIn* A, + const TIn* B, + const TOut* /* C */, + TGrad* dA, + TGrad* dB, + CPUContext* context) const { + if (A_dims == B_dims) { + const int size = std::accumulate( + A_dims.cbegin(), A_dims.cend(), 1, std::multiplies()); + math::Mul(size, dC, B, dA, context); + math::Mul(size, dC, A, dB, context); + return true; + } + const int ndim = std::max(A_dims.size(), B_dims.size()); + std::vector A_broadcast_dims(ndim); + std::vector B_broadcast_dims(ndim); + std::vector C_broadcast_dims(ndim); + math::utils::ComputeBroadcastBinaryOpDims( + A_dims.size(), + A_dims.data(), + B_dims.size(), + B_dims.data(), + A_broadcast_dims.data(), + B_broadcast_dims.data(), + C_broadcast_dims.data()); + ComputeMulGradient( + ndim, + A_broadcast_dims.data(), + B_broadcast_dims.data(), + C_broadcast_dims.data(), + dC, + A, + B, + dA, + dB, + context); + return true; } + +REGISTER_CPU_OPERATOR( + Mul, + BinaryElementwiseOp>); +REGISTER_CPU_OPERATOR( + MulGradient, + BinaryElementwiseGradientOp< + NumericTypes, + CPUContext, + MulFunctor>); + +namespace { + +class GetMulGradient final : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + + std::vector GetGradientDefs() override { + return SingleGradientDef( + "MulGradient", + "", + std::vector{GO(0), I(0), I(1)}, + std::vector{GI(0), GI(1)}); + } +}; + +} // namespace + +REGISTER_GRADIENT(Mul, GetMulGradient); + +} // namespace caffe2 diff --git a/caffe2/operators/elementwise_mul_op.cu b/caffe2/operators/elementwise_mul_op.cu new file mode 100644 index 00000000000000..1d891590525ffd --- /dev/null +++ b/caffe2/operators/elementwise_mul_op.cu @@ -0,0 +1,197 @@ +#include "caffe2/operators/elementwise_mul_op.h" + +#include +#include + +#include +#include + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +namespace { + +template +using BlockReduce = cub::BlockReduce; + +template +__global__ void ComputeMulGradientCUDAKernel( + const int outer_size, + const int inner_size, + const SimpleArray Y_dims, + const SimpleArray Y_strides, + const SimpleArray W_strides, + const SimpleArray X_dims, + const TGrad* dY, + const TIn* W, + TGrad* dX) { + __shared__ typename BlockReduce::TempStorage temp_storage; + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + TGrad sum = 0; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int X_index = i * inner_size + j; + int Y_index = 0; + int X_index_val = X_index; +#pragma unroll + for (int d = D - 1; d >= 0; --d) { + Y_index += (X_index_val % X_dims.data[d]) * Y_strides.data[d]; + X_index_val /= X_dims.data[d]; + } + int W_index = 0; + int Y_index_val = Y_index; +#pragma unroll + for (int d = D - 1; d >= 0; --d) { + W_index += W_strides.data[d] == 0 + ? 
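The CPU gradients walk a single multi-index over the output shape and project it onto each input: math::utils::GetIndexFromDims flattens the index against a dim vector while letting size-1 (broadcast) dimensions contribute nothing, and IncreaseIndexInDims is an odometer-style increment. The following is an illustrative re-implementation of those two helpers, not the library code itself:

#include <vector>

// Sketch: flatten a multi-index against dims, treating size-1 dims as broadcast.
int GetIndexFromDimsSketch(const std::vector<int>& dims, const std::vector<int>& index) {
  int flat = 0;
  for (std::size_t d = 0; d < dims.size(); ++d) {
    flat = flat * dims[d] + (dims[d] == 1 ? 0 : index[d]);
  }
  return flat;
}

// Sketch: odometer-style increment of a multi-index over dims.
void IncreaseIndexInDimsSketch(const std::vector<int>& dims, std::vector<int>* index) {
  for (int d = static_cast<int>(dims.size()) - 1; d >= 0; --d) {
    if (++(*index)[d] < dims[d]) {
      return;
    }
    (*index)[d] = 0;  // carry into the next (more significant) dimension
  }
}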
0 + : (Y_index_val % Y_dims.data[d]) * W_strides.data[d]; + Y_index_val /= Y_dims.data[d]; + } +#if __CUDA_ARCH__ >= 350 + sum += __ldg(dY + Y_index) * __ldg(W + W_index); +#else + sum += dY[Y_index] * W[W_index]; +#endif + } + sum = BlockReduce(temp_storage).Reduce(sum, cub::Sum()); + if (threadIdx.x == 0) { + dX[i] = sum; + } + __syncthreads(); + } +} + +template +void ComputeMulGradientCUDAImpl( + const int outer_size, + const int inner_size, + const int* Y_dims, + const int* W_dims, + const int* X_axes, + const TGrad* dY, + const TIn* W, + TGrad* dX, + CUDAContext* context) { + SimpleArray Y_dims_arr; + SimpleArray Y_strides_arr; + SimpleArray W_strides_arr; + SimpleArray X_dims_arr; + std::copy_n(Y_dims, D, Y_dims_arr.data); + math::utils::ComputeTransposedStrides(D, Y_dims, X_axes, Y_strides_arr.data); + int cur_stride = 1; + for (int i = D - 1; i >= 0; --i) { + W_strides_arr.data[i] = W_dims[i] == 1 ? 0 : cur_stride; + cur_stride *= W_dims[i]; + } + for (int i = 0; i < D; ++i) { + X_dims_arr.data[i] = Y_dims[X_axes[i]]; + } + ComputeMulGradientCUDAKernel + <<cuda_stream()>>>( + outer_size, + inner_size, + Y_dims_arr, + Y_strides_arr, + W_strides_arr, + X_dims_arr, + dY, + W, + dX); +} + +template +void ComputeMulGradientCUDA( + const std::vector& Y_dims, + const std::vector& W_dims, + const std::vector& X_axes, + const TGrad* dY, + const TIn* W, + TGrad* dX, + CUDAContext* context) { + CAFFE_ENFORCE_EQ(Y_dims.size(), W_dims.size()); + const int ndim = Y_dims.size(); + std::vector X_transpose_axes(ndim); + math::utils::ComputeTransposeAxesForReduceOp( + ndim, X_axes.size(), X_axes.data(), X_transpose_axes.data()); + const int pivot = ndim - X_axes.size(); + int outer_size = 1; + for (int i = 0; i < pivot; ++i) { + outer_size *= Y_dims[X_transpose_axes[i]]; + } + int inner_size = 1; + for (int i = pivot; i < ndim; ++i) { + inner_size *= Y_dims[X_transpose_axes[i]]; + } + DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2( + ndim, + ComputeMulGradientCUDAImpl, + TGrad, + TIn, + outer_size, + inner_size, + Y_dims.data(), + W_dims.data(), + X_transpose_axes.data(), + dY, + W, + dX, + context); +} + +} // namespace + +template <> +template +bool MulFunctor::Backward( + const std::vector& A_dims, + const std::vector& B_dims, + const TGrad* dC, + const TIn* A, + const TIn* B, + const TOut* /* C */, + TGrad* dA, + TGrad* dB, + CUDAContext* context) const { + if (A_dims == B_dims) { + const int size = std::accumulate( + A_dims.cbegin(), A_dims.cend(), 1, std::multiplies()); + math::Mul(size, dC, B, dA, context); + math::Mul(size, dC, A, dB, context); + return true; + } + const int ndim = std::max(A_dims.size(), B_dims.size()); + std::vector A_broadcast_dims(ndim); + std::vector B_broadcast_dims(ndim); + std::vector C_broadcast_dims(ndim); + math::utils::ComputeBroadcastBinaryOpDims( + A_dims.size(), + A_dims.data(), + B_dims.size(), + B_dims.data(), + A_broadcast_dims.data(), + B_broadcast_dims.data(), + C_broadcast_dims.data()); + std::vector A_axes; + std::vector B_axes; + ComputeBinaryBroadcastBackwardAxes(A_dims, B_dims, &A_axes, &B_axes); + ComputeMulGradientCUDA( + C_broadcast_dims, B_broadcast_dims, A_axes, dC, B, dA, context); + ComputeMulGradientCUDA( + C_broadcast_dims, A_broadcast_dims, B_axes, dC, A, dB, context); + return true; +} + +REGISTER_CUDA_OPERATOR( + Mul, + BinaryElementwiseOp>); +REGISTER_CUDA_OPERATOR( + MulGradient, + BinaryElementwiseGradientOp< + NumericTypes, + CUDAContext, + MulFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/elementwise_mul_op.h 
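The stride tables built for these kernels encode broadcasting directly: any dimension of size 1 gets stride 0 (W_strides above), so when a flat output index is peeled apart digit by digit, broadcast dimensions add nothing to the input offset. A host-side sketch of that index mapping, assuming row-major layout and equal-length dim/stride vectors:

#include <vector>

// Sketch: map an output index onto a (possibly broadcast) input whose
// size-1 dims were given stride 0.
int BroadcastOffset(int out_index, const std::vector<int>& out_dims,
                    const std::vector<int>& in_strides) {
  int offset = 0;
  for (int d = static_cast<int>(out_dims.size()) - 1; d >= 0; --d) {
    offset += (out_index % out_dims[d]) * in_strides[d];
    out_index /= out_dims[d];
  }
  return offset;
}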
b/caffe2/operators/elementwise_mul_op.h new file mode 100644 index 00000000000000..f1c42edc48b07a --- /dev/null +++ b/caffe2/operators/elementwise_mul_op.h @@ -0,0 +1,48 @@ +#ifndef CAFFE2_OPERATORS_ELEMENTWISE_MUL_OP_H_ +#define CAFFE2_OPERATORS_ELEMENTWISE_MUL_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct MulFunctor { + template + bool Forward( + const std::vector& A_dims, + const std::vector& B_dims, + const TIn* A, + const TIn* B, + TOut* C, + Context* context) const { + math::Mul( + A_dims.size(), + A_dims.data(), + B_dims.size(), + B_dims.data(), + A, + B, + C, + context); + return true; + } + + template + bool Backward( + const std::vector& A_dims, + const std::vector& B_dims, + const TGrad* dC_data, + const TIn* A_data, + const TIn* B_data, + const TOut* C_data, + TGrad* dA_data, + TGrad* dB_data, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_ELEMENTWISE_MUL_OP_H_ diff --git a/caffe2/operators/elementwise_op.cc b/caffe2/operators/elementwise_op.cc deleted file mode 100644 index 43499beaa23413..00000000000000 --- a/caffe2/operators/elementwise_op.cc +++ /dev/null @@ -1,162 +0,0 @@ -#include "caffe2/operators/elementwise_op.h" - -namespace caffe2 { - -// For some comparison and logical operators, eigen does not have vectorized -// math so we need to improvise. -#define NAIVE_FUNCTOR(name, op, input_type, output_type) \ - struct Naive##name##Functor { \ - template \ - inline void Run(size_t n, const T* a, const T* b, R* out, CPUContext*) { \ - for (int i = 0; i < n; ++i) { \ - out[i] = op(a[i], b[b_is_scalar ? 0 : i]); \ - } \ - } \ - template \ - void RunWithBroadcast( \ - const T* a, \ - const T* b, \ - R* out, \ - size_t pre, \ - size_t n, \ - CPUContext*) { \ - for (int i = 0; i < pre; ++i) { \ - for (int j = 0; j < n; ++j) { \ - out[i * n + j] = op(a[i * n + j], b[j]); \ - } \ - } \ - } \ - template \ - void RunWithBroadcast2( \ - const T* a, \ - const T* b, \ - R* out, \ - size_t pre, \ - size_t n, \ - size_t post, \ - CPUContext*) { \ - for (int i = 0; i < pre; ++i) { \ - for (int j = 0; j < n; ++j) { \ - for (int k = 0; k < post; ++k) { \ - out[(i * n + j) * post + k] = op(a[(i * n + j) * post + k], b[j]); \ - } \ - } \ - } \ - } \ - }; \ - REGISTER_CPU_OPERATOR( \ - name, \ - BinaryElementwiseOp< \ - input_type, \ - CPUContext, \ - Naive##name##Functor, \ - output_type>) - -#define NAIVE_LT(x, y) ((x) < (y)) -NAIVE_FUNCTOR(LT, NAIVE_LT, NumericTypes, FixedType); -#undef NAIVE_LT -#define NAIVE_LE(x, y) ((x) <= (y)) -NAIVE_FUNCTOR(LE, NAIVE_LE, NumericTypes, FixedType); -#undef NAIVE_LE -#define NAIVE_GT(x, y) ((x) > (y)) -NAIVE_FUNCTOR(GT, NAIVE_GT, NumericTypes, FixedType); -#undef NAIVE_GT -#define NAIVE_GE(x, y) ((x) >= (y)) -NAIVE_FUNCTOR(GE, NAIVE_GE, NumericTypes, FixedType); -#undef NAIVE_GE -#define NAIVE_EQ(x, y) ((x) == (y)) -NAIVE_FUNCTOR(EQ, NAIVE_EQ, IntBoolTypes, FixedType); -#undef NAIVE_EQ -#define NAIVE_AND(x, y) ((x) & (y)) -NAIVE_FUNCTOR(And, NAIVE_AND, BoolTypes, FixedType); -#undef NAIVE_AND -#define NAIVE_OR(x, y) ((x) | (y)) -NAIVE_FUNCTOR(Or, NAIVE_OR, BoolTypes, FixedType); -#undef NAIVE_OR -#define NAIVE_XOR(x, y) ((x) ^ (y)) -NAIVE_FUNCTOR(Xor, NAIVE_XOR, BoolTypes, FixedType); -#undef NAIVE_XOR - -struct NotFunctor { - inline void operator()(const int n, const bool* x, bool* y, CPUContext*) { - for (int i = 0; i < n; ++i) { - y[i] = !x[i]; - } - } -}; -REGISTER_CPU_OPERATOR( - Not, - UnaryElementwiseOp); - -template 
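Every binary op in this refactor plugs into the same stateless-functor contract shown in these headers: Forward(A_dims, B_dims, A, B, C, context) and Backward(A_dims, B_dims, dC, A, B, C, dA, dB, context), with the BinaryElementwise op templates supplying dtype dispatch, output sizing and broadcast bookkeeping. A deliberately simplified, same-shape-only sketch of a functor with that layout (subtraction used as the example; the real functors are additionally templated on dtypes and Context):

#include <functional>
#include <numeric>
#include <vector>

// Sketch only: a same-shape subtraction functor following the new
// Forward/Backward layout (no broadcasting, no Context, float only).
struct SubFunctorSketch {
  bool Forward(const std::vector<int>& A_dims, const std::vector<int>& /* B_dims */,
               const float* A, const float* B, float* C) const {
    const int size = std::accumulate(
        A_dims.cbegin(), A_dims.cend(), 1, std::multiplies<int>());
    for (int i = 0; i < size; ++i) {
      C[i] = A[i] - B[i];
    }
    return true;
  }

  bool Backward(const std::vector<int>& A_dims, const std::vector<int>& /* B_dims */,
                const float* dC, const float* /* A */, const float* /* B */,
                const float* /* C */, float* dA, float* dB) const {
    const int size = std::accumulate(
        A_dims.cbegin(), A_dims.cend(), 1, std::multiplies<int>());
    for (int i = 0; i < size; ++i) {
      dA[i] = dC[i];   // dC/dA = 1
      dB[i] = -dC[i];  // dC/dB = -1
    }
    return true;
  }
};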
-void SRLHelper::sum2one(const T* x, T* y, size_t n) { - *y = ConstEigenArrayMap(x, n, 1).sum(); -} - -template -void SRLHelper::RunWithBroadcastFront( - const T* x, - T* y, - size_t pre, - size_t n, - CPUContext*) { - EigenArrayMap(y, n, 1) = ConstEigenArrayMap(x, n, pre).rowwise().sum(); -} - -template -void SRLHelper::RunWithBroadcastBack( - const T* x, - T* y, - size_t post, - size_t n, - CPUContext*) { - EigenArrayMap(y, 1, n) = ConstEigenArrayMap(x, post, n).colwise().sum(); -} - -template -void SRLHelper::RunWithBroadcast2( - const T* a, - T* y, - size_t pre, - size_t n, - size_t post, - CPUContext*) { - for (int i = 0; i < n; ++i) { - y[i] = 0; - for (int j = 0; j < pre; ++j) { - for (int k = 0; k < post; ++k) { - y[i] += a[(j * n + i) * post + k]; - } - } - } -} - -template <> -template -bool SumReduceLikeOp::DoRunWithType() { - const auto& A = Input(0); - const auto& B = Input(1); - auto* C = Output(0); - CAFFE_ENFORCE(&B != C, "In-place is not allowed."); - C->ResizeLike(B); - const T* Adata = A.template data(); - auto* Cdata = C->template mutable_data(); - if (B.size() == 1) { - auto count = A.size(); - SRLHelper::sum2one(Adata, Cdata, count); - } else { - size_t pre, n, post; - std::tie(pre, n, post) = calculate_broadcast_sizes(A, B, axis_); - if (post == 1) { - SRLHelper::RunWithBroadcastFront(Adata, Cdata, pre, n, &context_); - } else if (pre == 1) { - SRLHelper::RunWithBroadcastBack(Adata, Cdata, post, n, &context_); - } else { - SRLHelper::RunWithBroadcast2(Adata, Cdata, pre, n, post, &context_); - } - } - return true; -} -REGISTER_CPU_OPERATOR(SumReduceLike, SumReduceLikeOp); - -} // namespace caffe2 diff --git a/caffe2/operators/elementwise_op.cu b/caffe2/operators/elementwise_op.cu deleted file mode 100644 index 567537f1e7fac1..00000000000000 --- a/caffe2/operators/elementwise_op.cu +++ /dev/null @@ -1,427 +0,0 @@ -#define CUB_STDERR -#include -#include -#include -#include "caffe2/core/common_gpu.h" -#include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/conversions.h" - -namespace caffe2 { - -#define CUDA_FUNCTOR(name, op, input_type, output_type) \ -template \ -__global__ void name##Kernel(const T* a, const T* b, R* out, int n) { \ - CUDA_1D_KERNEL_LOOP(i, n) { \ - out[i] = op(a[i], b[b_is_scalar ? 
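SumReduceLike is the reduction half of the legacy broadcast gradients: it collapses A down to B's shape by summing over the leading ("pre") and trailing ("post") extents that B does not cover, with A laid out as [pre][n][post]. A naive reference for the general case (the specialised paths are just fast variants for post == 1 and pre == 1):

#include <cstddef>
#include <vector>

// Sketch: y[i] = sum over pre and post of a, with a laid out as [pre][n][post].
std::vector<float> SumReduceLikeReference(const std::vector<float>& a,
                                          std::size_t pre, std::size_t n,
                                          std::size_t post) {
  std::vector<float> y(n, 0.0f);
  for (std::size_t j = 0; j < pre; ++j) {
    for (std::size_t i = 0; i < n; ++i) {
      for (std::size_t k = 0; k < post; ++k) {
        y[i] += a[(j * n + i) * post + k];
      }
    }
  }
  return y;
}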
0 : i]); \ - } \ -} \ -template \ -__global__ void name##BroadcastKernel( \ - const T* a, const T* b, R* out, int pre, int n) { \ - CUDA_1D_KERNEL_LOOP(i, pre * n) { \ - out[i] = op(a[i], b[i % n]); \ - } \ -} \ -template \ -__global__ void name##Broadcast2Kernel( \ - const T* a, const T* b, R* out, int pre, int n, int post) { \ - CUDA_1D_KERNEL_LOOP(i, pre * n * post) { \ - out[i] = op(a[i], b[(i / post) % n]); \ - } \ -} \ - \ -struct Cuda##name##Functor { \ - template \ - inline void Run( \ - size_t n, const T* a, const T* b, R* out, CUDAContext* context) { \ - name##Kernel<<cuda_stream()>>>( \ - a, b, out, n); \ - } \ - template \ - void RunWithBroadcast( \ - const T* a, const T* b, R* out, size_t pre, size_t n, \ - CUDAContext* context) { \ - name##BroadcastKernel<<cuda_stream()>>>( \ - a, b, out, pre, n); \ - } \ - template \ - void RunWithBroadcast2( \ - const T* a, const T* b, R* out, size_t pre, size_t n, size_t post, \ - CUDAContext* context) { \ - name##Broadcast2Kernel<<cuda_stream()>>>( \ - a, b, out, pre, n, post); \ - } \ -}; \ -REGISTER_CUDA_OPERATOR( \ - name, BinaryElementwiseOp< \ - input_type, CUDAContext, Cuda##name##Functor, output_type>) - -#define CUDA_SUB(x, y) ((x) - (y)) -CUDA_FUNCTOR(Sub, CUDA_SUB, NumericTypes, SameTypeAsInput); -#undef CUDA_SUB -#define CUDA_MUL(x, y) ((x) * (y)) -CUDA_FUNCTOR(Mul, CUDA_MUL, NumericTypes, SameTypeAsInput); -#undef CUDA_MUL -#define CUDA_DIV(x, y) ((x) / (y)) -CUDA_FUNCTOR(Div, CUDA_DIV, NumericTypes, SameTypeAsInput); -#undef CUDA_DIV -#define CUDA_LT(x, y) ((x) < (y)) -CUDA_FUNCTOR(LT, CUDA_LT, NumericTypes, FixedType); -#undef CUDA_LT -#define CUDA_LE(x, y) ((x) <= (y)) -CUDA_FUNCTOR(LE, CUDA_LE, NumericTypes, FixedType); -#undef CUDA_LE -#define CUDA_GT(x, y) ((x) > (y)) -CUDA_FUNCTOR(GT, CUDA_GT, NumericTypes, FixedType); -#undef CUDA_GT -#define CUDA_GE(x, y) ((x) >= (y)) -CUDA_FUNCTOR(GE, CUDA_GE, NumericTypes, FixedType); -#undef CUDA_GE -#define CUDA_EQ(x, y) ((x) == (y)) -CUDA_FUNCTOR(EQ, CUDA_EQ, IntBoolTypes, FixedType); -#undef CUDA_EQ -#define CUDA_AND(x, y) ((x) & (y)) -CUDA_FUNCTOR(And, CUDA_AND, BoolTypes, FixedType); -#undef CUDA_AND -#define CUDA_OR(x, y) ((x) | (y)) -CUDA_FUNCTOR(Or, CUDA_OR, BoolTypes, FixedType); -#undef CUDA_OR -#define CUDA_XOR(x, y) ((x) ^ (y)) -CUDA_FUNCTOR(Xor, CUDA_XOR, BoolTypes, FixedType); -#undef CUDA_XOR - -__global__ void NotKernel(const int n, const bool* x, bool* y) { - CUDA_1D_KERNEL_LOOP(i, n) { - y[i] = !x[i]; - } -} -struct CudaNotFunctor { - inline void operator()( - const int n, const bool* x, bool* y, CUDAContext* context) { - NotKernel<<cuda_stream()>>>(n, x, y); - } -}; -REGISTER_CUDA_OPERATOR( - Not, - UnaryElementwiseOp); - -__global__ void DivKernel(const int n, float *dXdata, float *dYdata, - const float *dZdata, const float *Ydata, - const float *Zdata) { - CUDA_1D_KERNEL_LOOP(i, n) { - dXdata[i] = dZdata[i] / Ydata[i]; - dYdata[i] = - (dZdata[i] * Zdata[i]) / Ydata[i]; - } -} - -void ElementWiseDivide( - CUDAContext& context, - const int n, - float* dXdata, - float* dYdata, - const float* dZdata, - const float* Ydata, - const float* Zdata) { - DivKernel<<>>(n, dXdata, dYdata, dZdata, Ydata, Zdata); -} - -REGISTER_CUDA_OPERATOR(DivGradient, DivGradientOp); - -namespace { - -template -__global__ void -reduce_sum_like_post1(const T* g_idata, T* g_odata, int pre, int N) { - int n = blockIdx.x * blockDim.x + threadIdx.x; - if (n >= N) { - return; - } - - float sum = 0.0; - for (int i = 0; i < pre; ++i) { - sum += convert::To(g_idata[i * N + n]); - } - - g_odata[n] = 
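device_reduce follows the standard two-call CUB idiom: the first DeviceReduce::Sum call with a null workspace pointer only reports temp_storage_bytes, the caller allocates that much scratch (here by resizing a caffe2 Tensor), and the second call performs the reduction on the stream. A standalone sketch using raw cudaMalloc instead of a Tensor; error checking omitted:

#include <cub/device/device_reduce.cuh>
#include <cuda_runtime.h>

// Sketch: sum n floats on the given stream using the two-phase CUB pattern.
void DeviceSumSketch(const float* d_in, float* d_out, int n, cudaStream_t stream) {
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  // Phase 1: query workspace size (no work is done while d_temp is nullptr).
  cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n, stream);
  cudaMalloc(&d_temp, temp_bytes);
  // Phase 2: run the reduction.
  cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n, stream);
  cudaFree(d_temp);
}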
convert::To(sum); -} - -template -void device_reduce( - const T* d_in, - T* d_out, - int N, - Tensor* buffer, - CUDAContext* context) { - // Determine temporary device storage requirements - size_t temp_storage_bytes = 0; - cub::DeviceReduce::Sum( - NULL, temp_storage_bytes, d_in, d_out, N, context->cuda_stream()); - - auto buffer_size = temp_storage_bytes / sizeof(T); - buffer_size += temp_storage_bytes % sizeof(T) != 0 ? 1 : 0; - buffer->Resize(buffer_size); - void* d_temp_storage = static_cast(buffer->template mutable_data()); - // Run sum-reduction - cub::DeviceReduce::Sum( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - N, - context->cuda_stream()); -} - -template <> -void device_reduce( - const float16* in, - float16* out, - int N, - Tensor* buffer, - CUDAContext* context) { - auto buffer_size = 1; - - if (buffer->size() != buffer_size) { - buffer->Resize(buffer_size); - - math::Set( - N, - convert::To(1.), - buffer->mutable_data(), - context); - } - - CUBLAS_ENFORCE(cublasDotEx( - context->cublas_handle(), - N, - in, - CUDA_R_16F, - 1, - buffer->data(), - CUDA_R_16F, - 0, - out, - CUDA_R_16F, - CUDA_R_32F)); -} - -template -__global__ void -reduce_sum_like(const T* g_idata, T* g_odata, int pre, int N, int post) { - int n = blockIdx.x; - float sum = 0.0; - int limit = pre * post; - for (int i = threadIdx.x; i < limit; i += blockDim.x) { - int curPre = i / post; - int curPost = i % post; - - sum += convert::To(g_idata[curPre * N * post + n * post + curPost]); - } - // uses a shared memory reduction within block - typedef cub::BlockReduce BlockReduceT; - // Shared memory - __shared__ typename BlockReduceT::TempStorage temp_storage; - float aggregate = BlockReduceT(temp_storage).Sum(sum); - if (threadIdx.x == 0) { - g_odata[n] = convert::To(aggregate); - } -} -} // namespace - -template <> -template -bool SumReduceLikeOp::DoRunWithType() { - const auto& A = Input(0); - const auto& B = Input(1); - auto* C = Output(0); - auto count = A.size(); - CAFFE_ENFORCE(&B != C, "In-place is not allowed."); - C->ResizeLike(B); - const T* Adata = A.template data(); - auto* Cdata = C->template mutable_data(); - if (B.size() == 1) { - device_reduce(Adata, Cdata, count, &sum_buffer_, &context_); - } else { - size_t pre, n, post; - std::tie(pre, n, post) = calculate_broadcast_sizes(A, B, axis_); - // because we check shape(B) \in shape(A) before, - // post and pre cannot be 1 at same time - if (post == 1) { - reduce_sum_like_post1 - <<>>(Adata, Cdata, pre, n); - } else { - if (post >= 128) { - reduce_sum_like - <<>>(Adata, Cdata, pre, n, post); - } else if (post >= 64) { - reduce_sum_like - <<>>(Adata, Cdata, pre, n, post); - } else if (post >= 32) { - reduce_sum_like - <<>>(Adata, Cdata, pre, n, post); - } else { - reduce_sum_like - <<>>(Adata, Cdata, pre, n, post); - } - } - } - return true; -} - -template <> -bool SumReduceLikeOp::RunOnDevice() { - return DispatchHelper>::call(this, Input(0)); -} - -REGISTER_CUDA_OPERATOR(SumReduceLike, SumReduceLikeOp); - -namespace { - -template -__global__ void binary_add_kernel(const int N, const T* a, const T* b, T* r) { - CUDA_1D_KERNEL_LOOP(idx, N) { - r[idx] = convert::To( - convert::To(a[idx]) + - convert::To(is_scaler ? b[0] : b[idx])); - } -} - -template -__global__ void binary_add_kernel_broadcast( - const T* a, - const T* b, - T* r, - const int pre, - const int post, - const int n) { - CUDA_1D_KERNEL_LOOP(idx, no_post ? pre * n : pre * post * n) { - r[idx] = convert::To( - convert::To(a[idx]) + - convert::To(no_post ? 
b[idx % n] : b[(idx / post) % n])); - } -} -} // namespace - -// Actual Add operator, because the above macros are read-only. -class CUDAAddOp final : public Operator { - public: - CUDAAddOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - OP_SINGLE_ARG(bool, "broadcast", enable_broadcast_, 0), - OP_SINGLE_ARG(int, "axis", axis_, -1), - OP_SINGLE_ARG(string, "axis_str", axis_str_, ""), - OP_SINGLE_ARG(string, "order", order_, "NCHW") { - // Figure out the correct axis to use. - if (enable_broadcast_) { - if (axis_ != -1) { - // Get axis from an explicit axis argument. - CAFFE_ENFORCE_EQ( - axis_str_.size(), - 0, - "Args axis and axis_str cannot be used simultaneously."); - } else if (axis_str_.size()) { - // Get the axis index semantically. - CAFFE_ENFORCE_EQ( - axis_str_.size(), 1, "Unsupported axis string", axis_str_); - size_t semantic_axis_ = order_.find(axis_str_); - CAFFE_ENFORCE_NE( - semantic_axis_, - string::npos, - "Unrecognizable axis string ", - axis_str_, - " from order string ", - order_); - axis_ = semantic_axis_; - } - } else { - CAFFE_ENFORCE( - axis_ == -1 && axis_str_.size() == 0, - "Do not specify axis or axis_str if broadcast is not enabled."); - } - } - - ~CUDAAddOp() {} - - template - bool DoRunWithType() { - auto& X0 = Input(0); - auto& X1 = Input(1); - auto* output = Output(0); - - output->ResizeLike(X0); - - const T* X0data = X0.template data(); - const T* X1data = X1.template data(); - T* outputData = output->template mutable_data(); - - if (!enable_broadcast_) { - CAFFE_ENFORCE_EQ( - X0.dims(), - X1.dims(), - "Dimension mismatch - did you forget to set broadcast=1?"); - binary_add_kernel<<< - CAFFE_GET_BLOCKS(X0.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>(X0.size(), X0data, X1data, outputData); - } else if (X1.size() == 1) { - binary_add_kernel<<< - CAFFE_GET_BLOCKS(X0.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>(X0.size(), X0data, X1data, outputData); - } else { - size_t pre, n, post; - std::tie(pre, n, post) = calculate_broadcast_sizes(X0, X1, axis_); - if (post == 1) { - binary_add_kernel_broadcast<<< - CAFFE_GET_BLOCKS(pre * n), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>(X0data, X1data, outputData, pre, post, n); - } else { - binary_add_kernel_broadcast<<< - CAFFE_GET_BLOCKS(pre * post * n), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>(X0data, X1data, outputData, pre, post, n); - } - } - return true; - } - - bool RunOnDevice() override { - if (Input(0).IsType()) { - return DoRunWithType(); - } else if (Input(0).IsType()) { - return DoRunWithType(); - } else if (Input(0).IsType()) { - return DoRunWithType(); - } else if (Input(0).IsType()) { - return DoRunWithType(); - } else { - return false; - } - } - - private: - bool enable_broadcast_; - int axis_; - string axis_str_; - string order_; -}; - -REGISTER_CUDA_OPERATOR(Add, CUDAAddOp); - -} // namespace caffe2 diff --git a/caffe2/operators/elementwise_op.h b/caffe2/operators/elementwise_op.h deleted file mode 100644 index d078f91d8a1bf2..00000000000000 --- a/caffe2/operators/elementwise_op.h +++ /dev/null @@ -1,432 +0,0 @@ -#ifndef CAFFE2_OPERATORS_ELEMENTWISE_OP_H_ -#define CAFFE2_OPERATORS_ELEMENTWISE_OP_H_ - -#include "caffe2/core/common_omp.h" -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/core/tensor.h" -#include "caffe2/utils/math.h" - -#include - -namespace caffe2 { - -using NumericTypes = TensorTypes; -using IntTypes = 
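The axis_str / order handling converts a layout letter into a numeric axis by position: with order "NCHW", axis_str "C" resolves to axis 1, and a letter not found in the order string is rejected. A trivial sketch of that lookup, names illustrative:

#include <cstddef>
#include <string>

// Sketch: map a single layout letter to its position in the order string,
// e.g. ResolveSemanticAxis("NCHW", "C") == 1; returns -1 if not found.
int ResolveSemanticAxis(const std::string& order, const std::string& axis_str) {
  const std::size_t pos = order.find(axis_str);
  return pos == std::string::npos ? -1 : static_cast<int>(pos);
}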
TensorTypes; -using BoolTypes = TensorTypes; -using IntBoolTypes = TensorTypes; // discrete types - -struct SameTypeAsInput { - template - using type = T; -}; - -template -struct FixedType { - template - using type = R; -}; - -template < - typename InputTypes, - class Context, - class Functor, - class TypeMap = SameTypeAsInput> -class UnaryElementwiseWithArgsOp : public Operator { - public: - USE_OPERATOR_CONTEXT_FUNCTIONS; - UnaryElementwiseWithArgsOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), functor_(*this) {} - - bool RunOnDevice() override { - return DispatchHelper::call(this, Input(0)); - } - - template - bool DoRunWithType() { - auto& input = Input(0); - auto* output = Output(0); - output->ResizeLike(input); - using R = typename TypeMap::template type; - functor_( - input.size(), - input.template data(), - output->template mutable_data(), - &context_); - return true; - } - - private: - Functor functor_; -}; - -/** - * WithDefaultConstructor is a functor that can be used as the functor of an - * UnaryElementwiseWithArgsOp. It simply forwards the operator() call into - * another functor that doesn't accept arguments in its constructor. - */ -template -struct WithDefaultConstructor { - explicit WithDefaultConstructor(OperatorBase& /*op*/) {} - - template - void operator()(int n, const In* in, Out* out, Context* c) { - Functor()(n, in, out, c); - } -}; - -/** - * UnaryElementwiseOp is a wrapper around UnaryElementwiseWithArgsOp, with the - * difference that it takes a functor with default constructor, e.g. that does - * not need to take into consideration any arguments during operator creation. - */ -template < - typename InputTypes, - class Context, - class Functor, - class OutputType = SameTypeAsInput> -using UnaryElementwiseOp = UnaryElementwiseWithArgsOp< - InputTypes, - Context, - WithDefaultConstructor, - OutputType>; - -template -std::tuple calculate_broadcast_sizes( - const Tensor& A, - const Tensor& B, - int axis) { - CAFFE_ENFORCE_GE( - A.ndim(), - B.ndim(), - "If you are doing broadcasting, input1 should have " - "a smaller or equal number of dimensions."); - if (axis == -1) { - axis = A.ndim() - B.ndim(); - } - CAFFE_ENFORCE( - axis >= 0 && axis <= A.ndim() - B.ndim(), - "Broadcast axis should be in the range of" - "[0, A.ndim() - B.ndim()], but axis = ", - axis); - - int b_dim_start = 0; - while (b_dim_start < B.ndim() && B.dim(b_dim_start) == 1) { - ++b_dim_start; - } - int b_dim_end = B.ndim() - 1; - while (b_dim_end >= b_dim_start && B.dim(b_dim_end) == 1) { - --b_dim_end; - } - size_t pre = 1, n = 1, post = 1; - for (int i = 0; i < axis + b_dim_start; ++i) { - pre *= A.dim(i); - } - for (int i = b_dim_start; i <= b_dim_end; ++i) { - CAFFE_ENFORCE_EQ( - A.dim(i + axis), B.dim(i), "Broadcast dimension mismatch."); - n *= B.dim(i); - } - for (int i = axis + b_dim_end + 1; i < A.ndim(); ++i) { - post *= A.dim(i); - } - return std::make_tuple(pre, n, post); -} - -/** - * Performs a binary operation (e.g. +, - or /) with optional broadcast support. - * - * Functor specifies actual operation to be performed. - * - * If AllowBroadcast=false tensors has to be of exactly the same shape. - * - * If AllowBroadcast=true it support limited broadcasting of the right-hand-side - * argument to match the shape of left-hand-side argument. Only suffix matching - * is supported for now (1-dim expansion is allowed on both ends). E.g. 
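calculate_broadcast_sizes (kept in the new header as ComputeLegacyBroadcastSizes) factors A's shape into (pre, n, post): n is the extent matched by B after trimming B's size-1 edges, pre is everything to its left and post everything to its right, so a flat index of A decomposes as (i_pre * n + i_n) * post + i_post. For example, A = {2, 3, 4, 5} with B = {3, 4} and axis = 1 gives pre = 2, n = 12, post = 5. A simplified sketch that assumes B's size-1 edges are already trimmed:

#include <cassert>
#include <cstddef>
#include <vector>

// Sketch: legacy (pre, n, post) factorisation for suffix-style broadcasting.
void LegacyBroadcastSizesSketch(const std::vector<int>& A_dims,
                                const std::vector<int>& B_dims, int axis,
                                std::size_t* pre, std::size_t* n, std::size_t* post) {
  *pre = *n = *post = 1;
  for (int i = 0; i < axis; ++i) {
    *pre *= A_dims[i];
  }
  for (std::size_t i = 0; i < B_dims.size(); ++i) {
    assert(A_dims[axis + i] == B_dims[i]);  // matched suffix must agree
    *n *= B_dims[i];
  }
  for (std::size_t i = axis + B_dims.size(); i < A_dims.size(); ++i) {
    *post *= A_dims[i];
  }
}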
this - * will be accepted: - * A dims: 2 3 4 5 6 - * B dims: 1 4 1 - * ^ - * | - * axis = 1 - */ -template < - typename InputTypes, - class Context, - class Functor, - class TypeMap = SameTypeAsInput> -class BinaryElementwiseOp : public Operator { - public: - USE_OPERATOR_CONTEXT_FUNCTIONS; - - BinaryElementwiseOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - OP_SINGLE_ARG(bool, "broadcast", enable_broadcast_, 0), - OP_SINGLE_ARG(int, "axis", axis_, -1), - OP_SINGLE_ARG(string, "axis_str", axis_str_, ""), - OP_SINGLE_ARG(string, "order", order_, "NCHW"), - functor_() { - // Figure out the correct axis to use. - if (enable_broadcast_) { - if (axis_ != -1) { - // Get axis from an explicit axis argument. - CAFFE_ENFORCE_EQ( - axis_str_.size(), - 0, - "Args axis and axis_str cannot be used simultaneously."); - } else if (axis_str_.size()) { - // Get the axis index semantically. - CAFFE_ENFORCE_EQ( - axis_str_.size(), 1, "Unsupported axis string", axis_str_); - size_t semantic_axis_ = order_.find(axis_str_); - CAFFE_ENFORCE_NE( - semantic_axis_, - string::npos, - "Unrecognizable axis string ", - axis_str_, - " from order string ", - order_); - axis_ = semantic_axis_; - } - } else { - CAFFE_ENFORCE( - axis_ == -1 && axis_str_.size() == 0, - "Do not specify axis or axis_str if broadcast is not enabled."); - } - } - - bool RunOnDevice() override { - return DispatchHelper::call(this, Input(0)); - } - - template - bool DoRunWithType() { - const auto& A = Input(0); - const auto& B = Input(1); - auto* C = Output(0); - CAFFE_ENFORCE( - &B != C || !enable_broadcast_, - "In-place is allowed only with the first tensor when broadcasting"); - C->ResizeLike(A); - const T* Adata = A.template data(); - const T* Bdata = B.template data(); - auto* Cdata = - C->template mutable_data>(); - if (!enable_broadcast_) { - CAFFE_ENFORCE_EQ( - A.dims(), - B.dims(), - "Dimension mismatch - did you forget to set broadcast=1?"); - functor_.template Run(A.size(), Adata, Bdata, Cdata, &context_); - } else if (B.size() == 1) { - functor_.template Run(A.size(), Adata, Bdata, Cdata, &context_); - } else { - size_t pre, n, post; - std::tie(pre, n, post) = calculate_broadcast_sizes(A, B, axis_); - if (post == 1) { - functor_.RunWithBroadcast(Adata, Bdata, Cdata, pre, n, &context_); - } else { - functor_.RunWithBroadcast2( - Adata, Bdata, Cdata, pre, n, post, &context_); - } - } - return true; - } - - private: - bool enable_broadcast_; - int axis_; - string axis_str_; - string order_; - Functor functor_; -}; - -template -struct WithoutBroadcast { - template - inline void Run(size_t n, const T* a, const T* b, R* out, Context* c) { - if (b_is_scalar) { - CAFFE_THROW("Broadcast not supported."); - } else { - Functor().Run(n, a, b, out, c); - } - } - template - inline void RunWithBroadcast( - const T* /*a*/, - const T* /*b*/, - R* /*out*/, - size_t /*pre*/, - size_t /*n*/, - Context*) { - CAFFE_NOT_IMPLEMENTED; - } - template - inline void RunWithBroadcast2( - const T* /*a*/, - const T* /*b*/, - R* /*out*/, - size_t /*pre*/, - size_t /*n*/, - size_t /*post*/, - Context*) { - CAFFE_NOT_IMPLEMENTED; - } -}; - -// Gradient operator for elementwise division. 
-template -class DivGradientOp final : public Operator { - public: - USE_SIMPLE_CTOR_DTOR(DivGradientOp); - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override; -}; - -namespace SRLHelper { - -template -void sum2one(const T* a, T* y, size_t n); - -template -void RunWithBroadcastFront(const T* a, T* y, size_t pre, size_t n, CPUContext*); - -template -void RunWithBroadcastBack(const T* a, T* y, size_t post, size_t n, CPUContext*); - -template -void RunWithBroadcast2( - const T* a, - T* y, - size_t pre, - size_t n, - size_t post, - CPUContext*); - -} // namespace SRLHelper - -// Sum reduction operator that is used for computing the gradient in cases -// where the forward op is in broadcast mode. -template -class SumReduceLikeOp final : public Operator { - public: - USE_OPERATOR_CONTEXT_FUNCTIONS; - SumReduceLikeOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - OP_SINGLE_ARG(int, "axis", axis_, -1), - OP_SINGLE_ARG(string, "axis_str", axis_str_, ""), - OP_SINGLE_ARG(string, "order", order_, "NCHW") { - if (axis_ != -1) { - // Get axis from an explicit axis argument. - CAFFE_ENFORCE_EQ( - axis_str_.size(), - 0, - "Args axis and axis_str cannot be used simultaneously."); - } else if (axis_str_.size()) { - // Get the axis index semantically. - CAFFE_ENFORCE_EQ( - axis_str_.size(), 1, "Unsupported axis string", axis_str_); - size_t semantic_axis = order_.find(axis_str_); - CAFFE_ENFORCE_NE( - semantic_axis, - string::npos, - "Unrecognizable axis string ", - axis_str_, - " from order string ", - order_); - axis_ = semantic_axis; - } - } - - bool RunOnDevice() override { - return DispatchHelper>::call(this, Input(0)); - } - - template - bool DoRunWithType(); - - private: - int axis_; - string axis_str_; - string order_; - Tensor ones_; - Tensor sum_buffer_; -}; - -template -bool DivGradientOp::RunOnDevice() { - auto& Y = Input(0); - auto& Z = Input(1); - auto& dZ = Input(2); - auto* dX = Output(0); - auto* dY = Output(1); - - dX->ResizeLike(Y); - dY->ResizeLike(Y); - - const float* Ydata = Y.template data(); - const float* Zdata = Z.template data(); - const float* dZdata = dZ.template data(); - float* dXdata = dX->template mutable_data(); - float* dYdata = dY->template mutable_data(); - - ElementWiseDivide(context_, Y.size(), dXdata, dYdata, dZdata, Ydata, Zdata); - return true; -} - -// For arithmetic operators, Eigen provides a good way to vectorize even -// when broadcasting. 
-#define EIGEN_FUNCTOR(name, eigen_op, input_type, output_type) \ - struct Eigen##name##Functor { \ - template \ - inline void Run(size_t n, const T* a, const T* b, R* out, CPUContext*) { \ - if (b_is_scalar) { \ - EigenVectorArrayMap(out, n) = \ - eigen_op((ConstEigenVectorArrayMap(a, n)), (b[0])); \ - } else { \ - EigenVectorArrayMap(out, n) = eigen_op( \ - (ConstEigenVectorArrayMap(a, n)), \ - (ConstEigenVectorArrayMap(b, n))); \ - } \ - } \ - template \ - void RunWithBroadcast( \ - const T* a, \ - const T* b, \ - R* out, \ - size_t pre, \ - size_t n, \ - CPUContext*) { \ - EigenArrayMap(out, n, pre) = eigen_op( \ - (ConstEigenArrayMap(a, n, pre).colwise()), \ - (ConstEigenVectorArrayMap(b, n))); \ - } \ - template \ - void RunWithBroadcast2( \ - const T* a, \ - const T* b, \ - R* out, \ - size_t pre, \ - size_t n, \ - size_t post, \ - CPUContext*) { \ - for (int i = 0; i < pre; ++i) { \ - EigenArrayMap(out + i * n * post, post, n) = eigen_op( \ - (ConstEigenArrayMap(a + i * n * post, post, n).rowwise()), \ - (Eigen::Map>(b, n))); \ - } \ - } \ - }; \ - REGISTER_CPU_OPERATOR( \ - name, \ - BinaryElementwiseOp< \ - input_type, \ - CPUContext, \ - Eigen##name##Functor, \ - output_type>) - -} // namespace caffe2 - -#endif // CAFFE2_OPERATORS_ELEMENTWISE_OP_H_ diff --git a/caffe2/operators/elementwise_op_test.h b/caffe2/operators/elementwise_op_test.h index a284b1cd196de1..9b8f08136de5bd 100644 --- a/caffe2/operators/elementwise_op_test.h +++ b/caffe2/operators/elementwise_op_test.h @@ -1,11 +1,12 @@ #ifndef CAFFE2_OPERATORS_ELEMENTWISE_OP_TEST_H_ #define CAFFE2_OPERATORS_ELEMENTWISE_OP_TEST_H_ +#include "caffe2/operators/elementwise_ops.h" + #include #include #include -#include "caffe2/operators/elementwise_op.h" #include template diff --git a/caffe2/operators/elementwise_ops.cc b/caffe2/operators/elementwise_ops.cc new file mode 100644 index 00000000000000..3a3dda53de2c0d --- /dev/null +++ b/caffe2/operators/elementwise_ops.cc @@ -0,0 +1,183 @@ +#include "caffe2/operators/elementwise_ops.h" + +#include + +namespace caffe2 { + +std::vector ComputeBinaryBroadcastForwardDims( + const std::vector& A_dims, + const std::vector& B_dims) { + const int ndim = std::max(A_dims.size(), B_dims.size()); + std::vector C_dims(ndim); + int i = A_dims.size() - 1; + int j = B_dims.size() - 1; + int k = ndim - 1; + for (; i >= 0 && j >= 0; --k) { + CAFFE_ENFORCE(A_dims[i] == B_dims[j] || A_dims[i] == 1 || B_dims[j] == 1); + C_dims[k] = std::max(A_dims[i--], B_dims[j--]); + } + for (; i >= 0; --i) { + C_dims[k--] = A_dims[i]; + } + for (; j >= 0; --j) { + C_dims[k--] = B_dims[j]; + } + return C_dims; +} + +void ComputeBinaryBroadcastBackwardAxes( + const std::vector& A_dims, + const std::vector& B_dims, + std::vector* A_axes, + std::vector* B_axes) { + A_axes->clear(); + B_axes->clear(); + const int ndim = std::max(A_dims.size(), B_dims.size()); + int i = A_dims.size() - 1; + int j = B_dims.size() - 1; + int k = ndim - 1; + for (; i >= 0 && j >= 0; --k) { + CAFFE_ENFORCE(A_dims[i] == B_dims[j] || A_dims[i] == 1 || B_dims[j] == 1); + if (A_dims[i] != B_dims[j]) { + if (A_dims[i] == 1) { + A_axes->push_back(k); + } + if (B_dims[j] == 1) { + B_axes->push_back(k); + } + } + --i; + --j; + } + if (i < 0) { + for (; k >= 0; --k) { + A_axes->push_back(k); + } + } else { + for (; k >= 0; --k) { + B_axes->push_back(k); + } + } + std::reverse(A_axes->begin(), A_axes->end()); + std::reverse(B_axes->begin(), B_axes->end()); +} + +REGISTER_CPU_OPERATOR( + Not, + UnaryElementwiseOp>); +REGISTER_CPU_OPERATOR( + Sign, + 
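ComputeBinaryBroadcastForwardDims aligns the two shapes from the right (each pair of dims must be equal or contain a 1, and the output takes the larger), while ComputeBinaryBroadcastBackwardAxes records, for each operand, the output axes it was broadcast over and must therefore be sum-reduced over in the backward pass. As an illustration (not taken from the tests): A = {2, 3, 4} and B = {3, 1} give C = {2, 3, 4}, with A_axes empty and B_axes = {0, 2}, so dB is dC summed over axes 0 and 2. A usage sketch against the declarations in this file:

#include <vector>
#include "caffe2/operators/elementwise_ops.h"  // declares the two helpers used below

void BroadcastAxesExample() {
  std::vector<int> A_dims{2, 3, 4}, B_dims{3, 1}, A_axes, B_axes;
  const std::vector<int> C_dims =
      caffe2::ComputeBinaryBroadcastForwardDims(A_dims, B_dims);  // {2, 3, 4}
  caffe2::ComputeBinaryBroadcastBackwardAxes(A_dims, B_dims, &A_axes, &B_axes);
  // A_axes == {}     -> dA is dC, unreduced
  // B_axes == {0, 2} -> dB is dC summed over axes 0 and 2
  (void)C_dims;
}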
UnaryElementwiseOp>); + +#define REGISTER_CPU_COMPARE_OPERATOR(Op) \ + REGISTER_CPU_OPERATOR( \ + Op, \ + BinaryElementwiseOp< \ + TensorTypes, \ + CPUContext, \ + Op##Functor, \ + FixedType>) + +REGISTER_CPU_COMPARE_OPERATOR(EQ); +REGISTER_CPU_COMPARE_OPERATOR(NE); +REGISTER_CPU_COMPARE_OPERATOR(LT); +REGISTER_CPU_COMPARE_OPERATOR(LE); +REGISTER_CPU_COMPARE_OPERATOR(GT); +REGISTER_CPU_COMPARE_OPERATOR(GE); + +#undef REGISTER_CPU_COMPARE_OPERATOR + +#define REGISTER_CPU_LOGICAL_BINARY_OPERATOR(Op) \ + REGISTER_CPU_OPERATOR( \ + Op, BinaryElementwiseOp>) + +REGISTER_CPU_LOGICAL_BINARY_OPERATOR(And); +REGISTER_CPU_LOGICAL_BINARY_OPERATOR(Or); +REGISTER_CPU_LOGICAL_BINARY_OPERATOR(Xor); + +#undef REGISTER_CPU_LOGICAL_BINARY_OPERATOR + +#define REGISTER_CPU_BITWISE_BINARY_OPERATOR(Op) \ + REGISTER_CPU_OPERATOR( \ + Op, \ + BinaryElementwiseOp>) + +REGISTER_CPU_BITWISE_BINARY_OPERATOR(BitwiseAnd); +REGISTER_CPU_BITWISE_BINARY_OPERATOR(BitwiseOr); +REGISTER_CPU_BITWISE_BINARY_OPERATOR(BitwiseXor); + +#undef REGISTER_CPU_BITWISE_BINARY_OPERATOR + +template +void SRLHelper::sum2one(const T* x, T* y, size_t n) { + *y = ConstEigenArrayMap(x, n, 1).sum(); +} + +template +void SRLHelper::RunWithBroadcastFront( + const T* x, + T* y, + size_t pre, + size_t n, + CPUContext*) { + EigenArrayMap(y, n, 1) = ConstEigenArrayMap(x, n, pre).rowwise().sum(); +} + +template +void SRLHelper::RunWithBroadcastBack( + const T* x, + T* y, + size_t post, + size_t n, + CPUContext*) { + EigenArrayMap(y, 1, n) = ConstEigenArrayMap(x, post, n).colwise().sum(); +} + +template +void SRLHelper::RunWithBroadcast2( + const T* a, + T* y, + size_t pre, + size_t n, + size_t post, + CPUContext*) { + for (int i = 0; i < n; ++i) { + y[i] = 0; + for (int j = 0; j < pre; ++j) { + for (int k = 0; k < post; ++k) { + y[i] += a[(j * n + i) * post + k]; + } + } + } +} + +template <> +template +bool SumReduceLikeOp::DoRunWithType() { + const auto& A = Input(0); + const auto& B = Input(1); + auto* C = Output(0); + CAFFE_ENFORCE(&B != C, "In-place is not allowed."); + C->ResizeLike(B); + const T* Adata = A.template data(); + auto* Cdata = C->template mutable_data(); + if (B.size() == 1) { + auto count = A.size(); + SRLHelper::sum2one(Adata, Cdata, count); + } else { + size_t pre, n, post; + std::tie(pre, n, post) = ComputeLegacyBroadcastSizes(A, B, axis_); + if (post == 1) { + SRLHelper::RunWithBroadcastFront(Adata, Cdata, pre, n, &context_); + } else if (pre == 1) { + SRLHelper::RunWithBroadcastBack(Adata, Cdata, post, n, &context_); + } else { + SRLHelper::RunWithBroadcast2(Adata, Cdata, pre, n, post, &context_); + } + } + return true; +} + +REGISTER_CPU_OPERATOR(SumReduceLike, SumReduceLikeOp); + +} // namespace caffe2 diff --git a/caffe2/operators/elementwise_ops.cu b/caffe2/operators/elementwise_ops.cu new file mode 100644 index 00000000000000..4b1e355a926631 --- /dev/null +++ b/caffe2/operators/elementwise_ops.cu @@ -0,0 +1,214 @@ +#include "caffe2/operators/elementwise_ops.h" + +#include +#include +#include + +#include "caffe2/core/common_gpu.h" +#include "caffe2/core/context_gpu.h" +#include "caffe2/utils/conversions.h" + +namespace caffe2 { + +REGISTER_CUDA_OPERATOR( + Not, + UnaryElementwiseOp>); +REGISTER_CUDA_OPERATOR( + Sign, + UnaryElementwiseOp>); + +#define REGISTER_CUDA_COMPARE_OPERATOR(Op) \ + REGISTER_CUDA_OPERATOR( \ + Op, \ + BinaryElementwiseOp< \ + TensorTypes, \ + CUDAContext, \ + Op##Functor, \ + FixedType>) + +REGISTER_CUDA_COMPARE_OPERATOR(EQ); +REGISTER_CUDA_COMPARE_OPERATOR(NE); +REGISTER_CUDA_COMPARE_OPERATOR(LT); 
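SRLHelper::RunWithBroadcastFront folds the leading "pre" extent by viewing the [pre][n] data as an n-by-pre column-major Eigen array and summing row-wise; RunWithBroadcastBack is the mirror image for the trailing extent. A self-contained sketch of the same view, assuming Eigen is available:

#include <Eigen/Core>

// Sketch: fold the leading "pre" extent of data laid out as [pre][n].
// Viewed column-major as n rows by pre columns, each column is one replica.
void ReduceFrontSketch(const float* x, float* y, int pre, int n) {
  Eigen::Map<const Eigen::ArrayXXf> X(x, n, pre);
  Eigen::Map<Eigen::ArrayXf>(y, n) = X.rowwise().sum();
}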
+REGISTER_CUDA_COMPARE_OPERATOR(LE); +REGISTER_CUDA_COMPARE_OPERATOR(GT); +REGISTER_CUDA_COMPARE_OPERATOR(GE); + +#undef REGISTER_CUDA_COMPARE_OPERATOR + +#define REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(Op) \ + REGISTER_CUDA_OPERATOR( \ + Op, \ + BinaryElementwiseOp>) + +REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(And); +REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(Or); +REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(Xor); + +#undef REGISTER_CUDA_LOGICAL_BINARY_OPERATOR + +#define REGISTER_CUDA_BITWISE_BINARY_OPERATOR(Op) \ + REGISTER_CUDA_OPERATOR( \ + Op, \ + BinaryElementwiseOp< \ + IntBoolTypes, \ + CUDAContext, \ + Op##Functor>) + +REGISTER_CUDA_BITWISE_BINARY_OPERATOR(BitwiseAnd); +REGISTER_CUDA_BITWISE_BINARY_OPERATOR(BitwiseOr); +REGISTER_CUDA_BITWISE_BINARY_OPERATOR(BitwiseXor); + +#undef REGISTER_CUDA_BITWISE_BINARY_OPERATOR + +namespace { + +template +__global__ void +reduce_sum_like_post1(const T* g_idata, T* g_odata, int pre, int N) { + int n = blockIdx.x * blockDim.x + threadIdx.x; + if (n >= N) { + return; + } + + float sum = 0.0; + for (int i = 0; i < pre; ++i) { + sum += convert::To(g_idata[i * N + n]); + } + + g_odata[n] = convert::To(sum); +} + +template +void device_reduce( + const T* d_in, + T* d_out, + int N, + Tensor* buffer, + CUDAContext* context) { + // Determine temporary device storage requirements + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Sum( + NULL, temp_storage_bytes, d_in, d_out, N, context->cuda_stream()); + + auto buffer_size = temp_storage_bytes / sizeof(T); + buffer_size += temp_storage_bytes % sizeof(T) != 0 ? 1 : 0; + buffer->Resize(buffer_size); + void* d_temp_storage = static_cast(buffer->template mutable_data()); + // Run sum-reduction + cub::DeviceReduce::Sum( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + N, + context->cuda_stream()); +} + +template <> +void device_reduce( + const float16* in, + float16* out, + int N, + Tensor* buffer, + CUDAContext* context) { + auto buffer_size = 1; + + if (buffer->size() != buffer_size) { + buffer->Resize(buffer_size); + + math::Set( + N, + convert::To(1.), + buffer->mutable_data(), + context); + } + + CUBLAS_ENFORCE(cublasDotEx( + context->cublas_handle(), + N, + in, + CUDA_R_16F, + 1, + buffer->data(), + CUDA_R_16F, + 0, + out, + CUDA_R_16F, + CUDA_R_32F)); +} + +template +__global__ void +reduce_sum_like(const T* g_idata, T* g_odata, int pre, int N, int post) { + int n = blockIdx.x; + float sum = 0.0; + int limit = pre * post; + for (int i = threadIdx.x; i < limit; i += blockDim.x) { + int curPre = i / post; + int curPost = i % post; + + sum += + convert::To(g_idata[curPre * N * post + n * post + curPost]); + } + // uses a shared memory reduction within block + typedef cub::BlockReduce BlockReduceT; + // Shared memory + __shared__ typename BlockReduceT::TempStorage temp_storage; + float aggregate = BlockReduceT(temp_storage).Sum(sum); + if (threadIdx.x == 0) { + g_odata[n] = convert::To(aggregate); + } +} +} // namespace + +template <> +template +bool SumReduceLikeOp::DoRunWithType() { + const auto& A = Input(0); + const auto& B = Input(1); + auto* C = Output(0); + auto count = A.size(); + CAFFE_ENFORCE(&B != C, "In-place is not allowed."); + C->ResizeLike(B); + const T* Adata = A.template data(); + auto* Cdata = C->template mutable_data(); + if (B.size() == 1) { + device_reduce(Adata, Cdata, count, &sum_buffer_, &context_); + } else { + size_t pre, n, post; + std::tie(pre, n, post) = ComputeLegacyBroadcastSizes(A, B, axis_); + // because we check shape(B) \in shape(A) before, + // post and pre cannot be 1 
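The float16 specialisation of device_reduce below cannot go through CUB, so it computes the sum as a dot product with "ones": cublasDotEx with CUDA_R_16F operands and CUDA_R_32F execution type accumulates in fp32, and because the y increment is 0 the "ones vector" is really a single cached 1.0 read N times, which is why the scratch buffer only needs one element. A hedged standalone sketch of that call (error handling and pointer-mode setup omitted; names are illustrative):

#include <cublas_v2.h>
#include <cuda_fp16.h>

// Sketch: sum of n fp16 values as dot(x, ones), accumulating in fp32.
// d_one points at a single 1.0h on the device; incy = 0 reuses it n times.
void HalfSumSketch(cublasHandle_t handle, int n, const __half* d_x,
                   const __half* d_one, __half* d_out) {
  cublasDotEx(handle, n,
              d_x, CUDA_R_16F, 1,
              d_one, CUDA_R_16F, 0,
              d_out, CUDA_R_16F, CUDA_R_32F);
}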
at same time + if (post == 1) { + reduce_sum_like_post1 + <<>>(Adata, Cdata, pre, n); + } else { + if (post >= 128) { + reduce_sum_like + <<>>(Adata, Cdata, pre, n, post); + } else if (post >= 64) { + reduce_sum_like + <<>>(Adata, Cdata, pre, n, post); + } else if (post >= 32) { + reduce_sum_like + <<>>(Adata, Cdata, pre, n, post); + } else { + reduce_sum_like + <<>>(Adata, Cdata, pre, n, post); + } + } + } + return true; +} + +template <> +bool SumReduceLikeOp::RunOnDevice() { + return DispatchHelper>::call(this, Input(0)); +} + +REGISTER_CUDA_OPERATOR(SumReduceLike, SumReduceLikeOp); + +} // namespace caffe2 diff --git a/caffe2/operators/elementwise_ops.h b/caffe2/operators/elementwise_ops.h new file mode 100644 index 00000000000000..5ae4c3d8748494 --- /dev/null +++ b/caffe2/operators/elementwise_ops.h @@ -0,0 +1,568 @@ +#ifndef CAFFE2_OPERATORS_ELEMENTWISE_OPS_H_ +#define CAFFE2_OPERATORS_ELEMENTWISE_OPS_H_ + +#include +#include +#include +#include + +#include "caffe2/core/common_omp.h" +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" +#include "caffe2/core/tensor.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +using NumericTypes = TensorTypes; +using IntTypes = TensorTypes; +using BoolTypes = TensorTypes; +using IntBoolTypes = TensorTypes; // discrete types + +struct SameTypeAsInput { + template + using type = T; +}; + +template +struct FixedType { + template + using type = R; +}; + +template < + typename InputTypes, + class Context, + class Functor, + class OutputTypeMap = SameTypeAsInput> +class UnaryElementwiseWithArgsOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + + UnaryElementwiseWithArgsOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), functor_(*this) {} + + bool RunOnDevice() override { + return DispatchHelper::call(this, Input(0)); + } + + template + bool DoRunWithType() { + const auto& X = Input(0); + auto* Y = Output(0); + Y->ResizeLike(X); + return functor_( + X.size(), + X.template data(), + Y->template mutable_data>(), + &context_); + } + + private: + Functor functor_; +}; + +// UnaryFunctorWithDefaultCtor is a functor that can be used as the functor of +// an UnaryElementwiseWithArgsOp. It simply forwards the operator() call into +// another functor that doesn't accept arguments in its constructor. +template +struct UnaryFunctorWithDefaultCtor { + explicit UnaryFunctorWithDefaultCtor(OperatorBase& /* op */) {} + + template + bool operator()(const int size, const TIn* X, TOut* Y, Context* context) + const { + return functor(size, X, Y, context); + } + + Functor functor{}; +}; + +// UnaryElementwiseOp is a wrapper around UnaryElementwiseWithArgsOp, with the +// difference that it takes a functor with default constructor, e.g. that does +// not need to take into consideration any arguments during operator creation. 
+template < + typename InputTypes, + class Context, + class Functor, + class OutputTypeMap = SameTypeAsInput> +using UnaryElementwiseOp = UnaryElementwiseWithArgsOp< + InputTypes, + Context, + UnaryFunctorWithDefaultCtor, + OutputTypeMap>; + +template +std::tuple ComputeLegacyBroadcastSizes( + const Tensor& A, + const Tensor& B, + int axis) { + CAFFE_ENFORCE_GE( + A.ndim(), + B.ndim(), + "If you are doing broadcasting, input1 should have " + "a smaller or equal number of dimensions."); + if (axis == -1) { + axis = A.ndim() - B.ndim(); + } + CAFFE_ENFORCE( + axis >= 0 && axis <= A.ndim() - B.ndim(), + "Broadcast axis should be in the range of" + "[0, A.ndim() - B.ndim()], but axis = ", + axis); + + int b_dim_start = 0; + while (b_dim_start < B.ndim() && B.dim(b_dim_start) == 1) { + ++b_dim_start; + } + int b_dim_end = B.ndim() - 1; + while (b_dim_end >= b_dim_start && B.dim(b_dim_end) == 1) { + --b_dim_end; + } + size_t pre = 1, n = 1, post = 1; + for (int i = 0; i < axis + b_dim_start; ++i) { + pre *= A.dim(i); + } + for (int i = b_dim_start; i <= b_dim_end; ++i) { + CAFFE_ENFORCE_EQ( + A.dim(i + axis), B.dim(i), "Broadcast dimension mismatch."); + n *= B.dim(i); + } + for (int i = axis + b_dim_end + 1; i < A.ndim(); ++i) { + post *= A.dim(i); + } + return std::make_tuple(pre, n, post); +} + +std::vector ComputeBinaryBroadcastForwardDims( + const std::vector& A_dims, + const std::vector& B_dims); + +void ComputeBinaryBroadcastBackwardAxes( + const std::vector& A_dims, + const std::vector& B_dims, + std::vector* A_axes, + std::vector* B_axes); + +template < + typename InputTypes, + class Context, + class Functor, + class OutputTypeMap = SameTypeAsInput> +class BinaryElementwiseWithArgsOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + + BinaryElementwiseWithArgsOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + OP_SINGLE_ARG(bool, "broadcast", legacy_broadcast_, false), + OP_SINGLE_ARG(int, "axis", axis_, -1), + OP_SINGLE_ARG(string, "axis_str", axis_str_, ""), + OP_SINGLE_ARG(string, "order", order_, "NCHW"), + functor_(*this) { + if (legacy_broadcast_) { + if (axis_ != -1) { + // Get axis from an explicit axis argument. + CAFFE_ENFORCE_EQ( + axis_str_.size(), + 0, + "Args axis and axis_str cannot be used simultaneously."); + } else if (axis_str_.size()) { + // Get the axis index semantically. 
+ CAFFE_ENFORCE_EQ( + axis_str_.size(), 1, "Unsupported axis string", axis_str_); + const size_t semantic_axis_ = order_.find(axis_str_); + CAFFE_ENFORCE_NE( + semantic_axis_, + string::npos, + "Unrecognizable axis string ", + axis_str_, + " from order string ", + order_); + axis_ = semantic_axis_; + } else { + CAFFE_ENFORCE( + axis_ == -1 && axis_str_.empty(), + "Do not specify axis or axis_str if broadcast is not enabled."); + } + } + } + + bool RunOnDevice() override { + return DispatchHelper::call(this, Input(0)); + } + + template + bool DoRunWithType() { + const auto& A = Input(0); + const auto& B = Input(1); + auto* C = Output(0); + const T* A_data = A.template data(); + const T* B_data = B.template data(); + std::vector A_dims; + std::vector B_dims; + + if (legacy_broadcast_) { + CAFFE_ENFORCE_NE( + C, + &B, + "In-place is allowed only with the first tensor when " + "legacy-broadcasting"); + C->ResizeLike(A); + if (B.size() == 1) { + A_dims = {static_cast(A.size())}; + B_dims = {1}; + } else { + size_t pre, n, post; + std::tie(pre, n, post) = ComputeLegacyBroadcastSizes(A, B, axis_); + A_dims = { + static_cast(pre), static_cast(n), static_cast(post)}; + B_dims = {static_cast(n), 1}; + } + } else { + std::copy(A.dims().cbegin(), A.dims().cend(), std::back_inserter(A_dims)); + std::copy(B.dims().cbegin(), B.dims().cend(), std::back_inserter(B_dims)); + const std::vector C_dims = + ComputeBinaryBroadcastForwardDims(A_dims, B_dims); + if (C == &A) { + CAFFE_ENFORCE_EQ(C_dims, A_dims); + } else if (C == &B) { + CAFFE_ENFORCE_EQ(C_dims, B_dims); + } else { + C->Resize(C_dims); + } + } + auto* C_data = + C->template mutable_data>(); + return functor_.Forward(A_dims, B_dims, A_data, B_data, C_data, &context_); + } + + private: + const bool legacy_broadcast_; + int axis_; + const std::string axis_str_; + const std::string order_; + + Functor functor_; +}; + +template < + typename InputTypes, + class Context, + class Functor, + class OutputTypeMap = SameTypeAsInput, + class GradientTypeMap = SameTypeAsInput> +class BinaryElementwiseWithArgsGradientOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + + BinaryElementwiseWithArgsGradientOp( + const OperatorDef& operator_def, + Workspace* ws) + : Operator(operator_def, ws), + OP_SINGLE_ARG(bool, "broadcast", legacy_broadcast_, false), + OP_SINGLE_ARG(int, "axis", axis_, -1), + OP_SINGLE_ARG(string, "axis_str", axis_str_, ""), + OP_SINGLE_ARG(string, "order", order_, "NCHW"), + functor_(*this) { + if (legacy_broadcast_) { + if (axis_ != -1) { + // Get axis from an explicit axis argument. + CAFFE_ENFORCE_EQ( + axis_str_.size(), + 0, + "Args axis and axis_str cannot be used simultaneously."); + } else if (axis_str_.size()) { + // Get the axis index semantically. 
+ CAFFE_ENFORCE_EQ( + axis_str_.size(), 1, "Unsupported axis string", axis_str_); + const size_t semantic_axis_ = order_.find(axis_str_); + CAFFE_ENFORCE_NE( + semantic_axis_, + string::npos, + "Unrecognizable axis string ", + axis_str_, + " from order string ", + order_); + axis_ = semantic_axis_; + } else { + CAFFE_ENFORCE( + axis_ == -1 && axis_str_.empty(), + "Do not specify axis or axis_str if broadcast is not enabled."); + } + } + } + + bool RunOnDevice() override { + return DispatchHelper::call(this, Input(1)); + } + + template + bool DoRunWithType() { + const auto& dC = Input(0); + const auto& A = Input(1); + const auto& B = Input(2); + auto* dA = Output(0); + auto* dB = Output(1); + vector A_dims; + vector B_dims; + if (legacy_broadcast_) { + if (B.size() == 1) { + A_dims = {static_cast(A.size())}; + B_dims = {1}; + } else { + size_t pre, n, post; + std::tie(pre, n, post) = ComputeLegacyBroadcastSizes(A, B, axis_); + A_dims = { + static_cast(pre), static_cast(n), static_cast(post)}; + B_dims = {static_cast(n), 1}; + } + } else { + std::copy(A.dims().cbegin(), A.dims().cend(), std::back_inserter(A_dims)); + std::copy(B.dims().cbegin(), B.dims().cend(), std::back_inserter(B_dims)); + } + const typename OutputTypeMap::template type* C_data = nullptr; + if (InputSize() == 4) { + const auto& C = Input(3); + C_data = C.template data>(); + } + const auto* dC_data = + dC.template data>(); + const T* A_data = A.template data(); + const T* B_data = B.template data(); + dA->ResizeLike(A); + dB->ResizeLike(B); + auto* dA_data = + dA->template mutable_data>(); + auto* dB_data = + dB->template mutable_data>(); + return functor_.Backward( + A_dims, + B_dims, + dC_data, + A_data, + B_data, + C_data, + dA_data, + dB_data, + &context_); + } + + private: + const bool legacy_broadcast_; + int axis_; + const std::string axis_str_; + const std::string order_; + + Functor functor_; +}; + +template +struct BinaryFunctorWithDefaultCtor { + explicit BinaryFunctorWithDefaultCtor(OperatorBase& /* op */) {} + + template + bool Forward( + const std::vector& A_dims, + const std::vector& B_dims, + const TIn* A_data, + const TIn* B_data, + TOut* C_data, + Context* context) const { + return functor.Forward(A_dims, B_dims, A_data, B_data, C_data, context); + } + + template + bool Backward( + const std::vector& A_dims, + const std::vector& B_dims, + const TGrad* dC_data, + const TIn* A_data, + const TIn* B_data, + const TOut* C_data, + TGrad* dA_data, + TGrad* dB_data, + Context* context) const { + return functor.Backward( + A_dims, + B_dims, + dC_data, + A_data, + B_data, + C_data, + dA_data, + dB_data, + context); + } + + Functor functor{}; +}; + +// BinaryElementwiseOp is a wrapper around BinaryElementwiseWithArgsOp, with the +// difference that it takes a functor with default constructor, e.g. that does +// not need to take into consideration any arguments during operator creation. +template < + typename InputTypes, + class Context, + class Functor, + class TypeMap = SameTypeAsInput> +using BinaryElementwiseOp = BinaryElementwiseWithArgsOp< + InputTypes, + Context, + BinaryFunctorWithDefaultCtor, + TypeMap>; + +// BinaryElementwiseGradientOp is a wrapper around +// BinaryElementwiseGradientWithArgsOp, with the difference that it takes a +// functor with default constructor, e.g. that does not need to take into +// consideration any arguments during operator creation. 
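When an input was broadcast in the forward pass, its gradient has to be summed from dC back down to that input's shape; this is what SumReduceLike and ComputeBinaryBroadcastBackwardAxes are used for by the gradient wrappers above. The following is a minimal reference of that reduction in the legacy (pre, n, post) layout, written against the standard library only and independent of the CUDA and Eigen kernels in this patch.

#include <cstdio>
#include <vector>

// dC has the shape of A, decomposed as (pre, n, post); B was broadcast along
// its single axis of length n, so dB[j] sums dC over the pre and post axes.
void SumReduceLikeRef(const float* dC, float* dB, int pre, int n, int post) {
  for (int j = 0; j < n; ++j) {
    float sum = 0.0f;
    for (int i = 0; i < pre; ++i) {
      for (int k = 0; k < post; ++k) {
        sum += dC[(i * n + j) * post + k];
      }
    }
    dB[j] = sum;
  }
}

int main() {
  // A: (2, 3, 2) -> pre = 2, n = 3, post = 2; B: (3,).
  std::vector<float> dC(12, 1.0f);
  std::vector<float> dB(3, 0.0f);
  SumReduceLikeRef(dC.data(), dB.data(), 2, 3, 2);
  std::printf("%g %g %g\n", dB[0], dB[1], dB[2]);  // each entry sums 4 ones
  return 0;
}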
+template < + typename InputTypes, + class Context, + class Functor, + class OutputTypeMap = SameTypeAsInput, + class GradientTypeMap = SameTypeAsInput> +using BinaryElementwiseGradientOp = BinaryElementwiseWithArgsGradientOp< + InputTypes, + Context, + BinaryFunctorWithDefaultCtor, + OutputTypeMap, + GradientTypeMap>; + +// Forward-only Unary Functors. +template +struct NotFunctor { + bool operator()(const int N, const bool* X, bool* Y, Context* context) const { + math::Not(N, X, Y, context); + return true; + } +}; + +template +struct SignFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Sign(N, X, Y, context); + return true; + } +}; + +// Forward-only Binary Functors. +#define CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(FunctorName) \ + template \ + struct FunctorName##Functor { \ + template \ + bool Forward( \ + const std::vector& A_dims, \ + const std::vector& B_dims, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + Context* context) const { \ + math::FunctorName( \ + A_dims.size(), \ + A_dims.data(), \ + B_dims.size(), \ + B_dims.data(), \ + A, \ + B, \ + C, \ + context); \ + return true; \ + } \ + }; + +// Compare functors. +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(EQ); +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(NE); +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(LT); +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(LE); +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(GT); +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(GE); + +// Logical functors. +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(And); +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(Or); +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(Xor); + +// Bitwise functors. +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(BitwiseAnd); +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(BitwiseOr); +CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR(BitwiseXor); + +#undef CAFFE2_DECLARE_FOWARD_ONLY_BINARY_FUNCTOR + +namespace SRLHelper { + +template +void sum2one(const T* a, T* y, size_t n); + +template +void RunWithBroadcastFront(const T* a, T* y, size_t pre, size_t n, CPUContext*); + +template +void RunWithBroadcastBack(const T* a, T* y, size_t post, size_t n, CPUContext*); + +template +void RunWithBroadcast2( + const T* a, + T* y, + size_t pre, + size_t n, + size_t post, + CPUContext*); + +} // namespace SRLHelper + +// Sum reduction operator that is used for computing the gradient in cases +// where the forward op is in broadcast mode. +template +class SumReduceLikeOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + SumReduceLikeOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + OP_SINGLE_ARG(int, "axis", axis_, -1), + OP_SINGLE_ARG(string, "axis_str", axis_str_, ""), + OP_SINGLE_ARG(string, "order", order_, "NCHW") { + if (axis_ != -1) { + // Get axis from an explicit axis argument. + CAFFE_ENFORCE_EQ( + axis_str_.size(), + 0, + "Args axis and axis_str cannot be used simultaneously."); + } else if (axis_str_.size()) { + // Get the axis index semantically. 
+ CAFFE_ENFORCE_EQ( + axis_str_.size(), 1, "Unsupported axis string", axis_str_); + size_t semantic_axis = order_.find(axis_str_); + CAFFE_ENFORCE_NE( + semantic_axis, + string::npos, + "Unrecognizable axis string ", + axis_str_, + " from order string ", + order_); + axis_ = semantic_axis; + } + } + + bool RunOnDevice() override { + return DispatchHelper>::call(this, Input(0)); + } + + template + bool DoRunWithType(); + + private: + int axis_; + string axis_str_; + string order_; + Tensor ones_; + Tensor sum_buffer_; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_ELEMENTWISE_OPS_H_ diff --git a/caffe2/operators/elementwise_op_schema.cc b/caffe2/operators/elementwise_ops_schema.cc similarity index 62% rename from caffe2/operators/elementwise_op_schema.cc rename to caffe2/operators/elementwise_ops_schema.cc index 860f2f55ed80bf..cab6179364b2ca 100644 --- a/caffe2/operators/elementwise_op_schema.cc +++ b/caffe2/operators/elementwise_ops_schema.cc @@ -1,5 +1,6 @@ +#include "caffe2/operators/elementwise_ops.h" + #include "caffe2/core/operator_gradient.h" -#include "caffe2/operators/elementwise_op.h" #include "caffe2/utils/proto_utils.h" namespace caffe2 { @@ -56,6 +57,11 @@ OPERATOR_SCHEMA(Add) .IdenticalTypeAndShapeOfInput(0) .FillUsing(MathDocGenerator("addition")) .InheritOnnxSchema("Add"); +OPERATOR_SCHEMA(AddGradient) + .NumInputs(3) + .NumOutputs(2) + .AllowInplace({{0, 0}, {1, 0}}); + OPERATOR_SCHEMA(Sub) .NumInputs(2) .NumOutputs(1) @@ -64,6 +70,11 @@ OPERATOR_SCHEMA(Sub) .IdenticalTypeAndShapeOfInput(0) .FillUsing(MathDocGenerator("subtraction")) .InheritOnnxSchema("Sub"); +OPERATOR_SCHEMA(SubGradient) + .NumInputs(3) + .NumOutputs(2) + .AllowInplace({{0, 0}, {1, 0}}); + OPERATOR_SCHEMA(Mul) .NumInputs(2) .NumOutputs(1) @@ -72,6 +83,11 @@ OPERATOR_SCHEMA(Mul) .IdenticalTypeAndShapeOfInput(0) .FillUsing(MathDocGenerator("multiplication")) .InheritOnnxSchema("Mul"); +OPERATOR_SCHEMA(MulGradient) + .NumInputs(3) + .NumOutputs(2) + .AllowInplace({{0, 0}, {1, 0}}); + OPERATOR_SCHEMA(Div) .NumInputs(2) .NumOutputs(1) @@ -80,7 +96,7 @@ OPERATOR_SCHEMA(Div) .IdenticalTypeAndShapeOfInput(0) .FillUsing(MathDocGenerator("division")) .InheritOnnxSchema("Div"); -OPERATOR_SCHEMA(DivGradient).NumInputs(3).NumOutputs(2).AllowInplace({{0, 0}}); +OPERATOR_SCHEMA(DivGradient).NumInputs(4).NumOutputs(2).AllowInplace({{0, 0}}); OPERATOR_SCHEMA(SumReduceLike) .NumInputs(2) @@ -120,159 +136,6 @@ For example, the following tensor shapes are supported: "If broadcasting is disabled it should be of the same size.") .Output(0, "C", "Result, has same dimensions and type as B"); -class GetAddGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - if (!ArgumentHelper::HasArgument(Def(), "broadcast")) { - SetDense(0, GO(0)); - SetDense(1, GO(0)); - return vector(); - } - SetDense(0, GO(0)); - - return SingleGradientDef( - "SumReduceLike", - "", - vector{GO(0), I(1)}, - vector{GI(1)}); - } -}; -REGISTER_GRADIENT(Add, GetAddGradient); - -// TODO(jiayq): Although we have Sub gradient implemented, we are still missing -// the Negative unary operator to be implemented. 
-class GetSubGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - if (!ArgumentHelper::HasArgument(Def(), "broadcast")) { - SetDense(0, GO(0)); - return SingleGradientDef( - "Negative", "", vector{GO(0)}, vector{GI(1)}); - } else { - SetDense(0, GO(0)); - vector grad_ops; - grad_ops.push_back(CreateOperatorDef( - "Negative", - "", - vector{GO(0)}, - vector{GI(1) + "_autogen_pre_red"})); - - Argument axis, axis_str, order; - if (ArgumentHelper::HasArgument(Def(), "axis")) { - axis = GetArgument(Def(), "axis"); - } else { - axis = MakeArgument("axis", -1); - } - if (ArgumentHelper::HasArgument(Def(), "axis_str")) { - axis_str = GetArgument(Def(), "axis_str"); - } else { - axis_str = MakeArgument("axis_str", ""); - } - if (ArgumentHelper::HasArgument(Def(), "order")) { - order = GetArgument(Def(), "order"); - } else { - order = MakeArgument("order", "NCHW"); - } - grad_ops.push_back(CreateOperatorDef( - "SumReduceLike", - "", - vector{GI(1) + "_autogen_pre_red", I(1)}, - vector{GI(1)}, - vector{axis, axis_str, order})); - - return grad_ops; - } - } - // Make sure the broadcast argument is not copied over. - bool CopyArguments() const override { - return false; - } -}; -REGISTER_GRADIENT(Sub, GetSubGradient); - -class GetMulGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - CAFFE_ENFORCE( - Def().input(0) != Def().output(0) && Def().input(1) != Def().output(0), - "Gradient computation cannot be carried out if Mul uses in-place " - "computation: ", - ProtoDebugString(Def())); - if (!ArgumentHelper::HasArgument(Def(), "broadcast")) { - return vector{ - CreateOperatorDef( - "Mul", "", vector{GO(0), I(1)}, vector{GI(0)}), - CreateOperatorDef( - "Mul", "", vector{GO(0), I(0)}, vector{GI(1)})}; - } else { - Argument broadcast, axis, axis_str, order; - if (ArgumentHelper::HasArgument(Def(), "broadcast")) { - broadcast = GetArgument(Def(), "broadcast"); - } else { - broadcast = MakeArgument("broadcast", 0); - } - if (ArgumentHelper::HasArgument(Def(), "axis")) { - axis = GetArgument(Def(), "axis"); - } else { - axis = MakeArgument("axis", -1); - } - if (ArgumentHelper::HasArgument(Def(), "axis_str")) { - axis_str = GetArgument(Def(), "axis_str"); - } else { - axis_str = MakeArgument("axis_str", ""); - } - if (ArgumentHelper::HasArgument(Def(), "order")) { - order = GetArgument(Def(), "order"); - } else { - order = MakeArgument("order", "NCHW"); - } - - vector grad_ops; - grad_ops.push_back(CreateOperatorDef( - "Mul", - "mul_grad_1st_op", - vector{GO(0), I(1)}, - vector{GI(0)}, - vector{broadcast, axis, axis_str, order})); - grad_ops.push_back(CreateOperatorDef( - "Mul", - "mul_gradient_2nd_op", - vector{GO(0), I(0)}, - vector{GI(1) + "_autogen_pre_red"})); - - grad_ops.push_back(CreateOperatorDef( - "SumReduceLike", - "mul_with_broadcast_grad_3", - vector{GI(1) + "_autogen_pre_red", I(1)}, - vector{GI(1)}, - vector{axis, axis_str, order})); - - return grad_ops; - } - } - - // Make sure the broadcast argument is not copied over. 
- bool CopyArguments() const override { - return false; - } -}; -REGISTER_GRADIENT(Mul, GetMulGradient); - -class GetDivGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - CAFFE_ENFORCE( - !ArgumentHelper::HasArgument(Def(), "broadcast"), - "Gradient not ready yet for Div with broadcasting."); - return SingleGradientDef( - "DivGradient", - "", - vector{I(1), O(0), GO(0)}, - vector{GI(0), GI(1)}); - } -}; -REGISTER_GRADIENT(Div, GetDivGradient); - std::function ComparisonDocGenerator( const char* name, const char* desc) { @@ -324,11 +187,12 @@ Performs element-wise {desc} comparison `{name}` (with limited broadcast support .FillUsing(ComparisonDocGenerator(symbol, desc)); \ SHOULD_NOT_DO_GRADIENT(name) +CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(EQ, "==", "equal to"); +CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(NE, "!=", "not equal to"); CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(LT, "<", "less than"); CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(LE, "<=", "less or equal than"); CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(GT, ">", "greater than"); CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(GE, ">=", "greater or equal than"); -CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(EQ, "==", "equality"); std::function LogicalDocGenerator(const char* name) { return [=](OpSchema& schema) { @@ -354,17 +218,56 @@ Both input operands should be of type `bool`. } #define CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(name, symbol, onnx_schema) \ + OPERATOR_SCHEMA(name) \ + .NumInputs(2) \ + .NumOutputs(1) \ + .AllowInplace({{0, 0}}) \ + .FillUsing(LogicalDocGenerator(symbol)) \ + .InheritOnnxSchema(onnx_schema); \ + SHOULD_NOT_DO_GRADIENT(name) + +CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(Or, "or", "Or"); +CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(And, "and", "And"); +CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(Xor, "xor", "Xor"); + +#undef CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP + +std::function BitwiseDocGenerator(const char* name) { + return [=](OpSchema& schema) { + string doc = R"DOC( +Performs element-wise bitwise operation `{name}` (with limited broadcast support). +Both input operands should be of type `bool`. +{broadcast_doc})DOC"; + ReplaceAll(doc, "{name}", name); + ReplaceAll(doc, "{broadcast_doc}", kBroadcastDoc); + schema.SetDoc(doc); + schema.Arg("broadcast", "Pass 1 to enable broadcasting"); + schema.Arg( + "axis", + "If set, defines the broadcast dimensions. See doc for details."); + schema.Input(0, "A", "First operand."); + schema.Input( + 1, + "B", + "Second operand. With broadcasting can be of smaller size than A. 
" + "If broadcasting is disabled it should be of the same size."); + schema.Output(0, "C", "Result, has same dimensions and type with A."); + }; +} + +#define CAFFE2_SCHEMA_FOR_BINARY_BITWISE_OP(name, symbol) \ OPERATOR_SCHEMA(name) \ .NumInputs(2) \ .NumOutputs(1) \ .AllowInplace({{0, 0}}) \ - .FillUsing(LogicalDocGenerator(symbol)) \ - .InheritOnnxSchema(onnx_schema); \ + .FillUsing(LogicalDocGenerator(symbol)); \ SHOULD_NOT_DO_GRADIENT(name) -CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(Or, "or", "Or"); -CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(And, "and", "And"); -CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(Xor, "xor", "Xor"); +CAFFE2_SCHEMA_FOR_BINARY_BITWISE_OP(BitwiseOr, "bitwise_or"); +CAFFE2_SCHEMA_FOR_BINARY_BITWISE_OP(BitwiseAnd, "bitwise_and"); +CAFFE2_SCHEMA_FOR_BINARY_BITWISE_OP(BitwiseXor, "bitwise_xor"); + +#undef CAFFE2_SCHEMA_FOR_BINARY_BITWISE_OP OPERATOR_SCHEMA(Not) .NumInputs(1) @@ -375,4 +278,12 @@ OPERATOR_SCHEMA(Not) .InheritOnnxSchema("Not"); SHOULD_NOT_DO_GRADIENT(Not); +OPERATOR_SCHEMA(Sign) + .NumInputs(1) + .NumOutputs(1) + .SetDoc(R"DOC(Performs element-wise sign.)DOC") + .Input(0, "X", "Input tensor.") + .Output(0, "Y", "Output tensor."); +SHOULD_NOT_DO_GRADIENT(Sign); + } // namespace caffe2 diff --git a/caffe2/operators/elementwise_sub_op.cc b/caffe2/operators/elementwise_sub_op.cc index 5cfa15cf75d5d2..b1de78e66cb37a 100644 --- a/caffe2/operators/elementwise_sub_op.cc +++ b/caffe2/operators/elementwise_sub_op.cc @@ -1,10 +1,33 @@ -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/elementwise_sub_op.h" namespace caffe2 { -// See the operations supported here: -// https://eigen.tuxfamily.org/dox-devel/group__QuickRefPage.html -#define EIGEN_SUB(x, y) ((x) - (y)) -EIGEN_FUNCTOR(Sub, EIGEN_SUB, NumericTypes, SameTypeAsInput); -#undef EIGEN_SUB -} +REGISTER_CPU_OPERATOR( + Sub, + BinaryElementwiseOp>); +REGISTER_CPU_OPERATOR( + SubGradient, + BinaryElementwiseGradientOp< + NumericTypes, + CPUContext, + SubFunctor>); + +namespace { + +class GetSubGradient final : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + + std::vector GetGradientDefs() override { + return SingleGradientDef( + "SubGradient", + "", + std::vector{GO(0), I(0), I(1)}, + std::vector{GI(0), GI(1)}); + } +}; + +} // namespace + +REGISTER_GRADIENT(Sub, GetSubGradient); + +} // namespace caffe2 diff --git a/caffe2/operators/elementwise_sub_op.h b/caffe2/operators/elementwise_sub_op.h new file mode 100644 index 00000000000000..20d38bfff58c8a --- /dev/null +++ b/caffe2/operators/elementwise_sub_op.h @@ -0,0 +1,76 @@ +#ifndef CAFFE2_OPERATORS_ELEMENTWISE_SUB_OP_H_ +#define CAFFE2_OPERATORS_ELEMENTWISE_SUB_OP_H_ + +#include +#include +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct SubFunctor { + template + bool Forward( + const std::vector& A_dims, + const std::vector& B_dims, + const TIn* A, + const TIn* B, + TOut* C, + Context* context) const { + math::Sub( + A_dims.size(), + A_dims.data(), + B_dims.size(), + B_dims.data(), + A, + B, + C, + context); + return true; + } + + template + bool Backward( + const std::vector& A_dims, + const std::vector& B_dims, + const TGrad* dC, + const TIn* /* A */, + const TIn* /* B */, + const TOut* /* C */, + TGrad* dA, + TGrad* dB, + Context* context) const { + const std::vector C_dims = + ComputeBinaryBroadcastForwardDims(A_dims, B_dims); + std::vector A_axes; + std::vector B_axes; + ComputeBinaryBroadcastBackwardAxes(A_dims, B_dims, &A_axes, &B_axes); + 
math::ReduceSum( + C_dims.size(), + C_dims.data(), + A_axes.size(), + A_axes.data(), + dC, + dA, + context); + math::ReduceSum( + C_dims.size(), + C_dims.data(), + B_axes.size(), + B_axes.data(), + dC, + dB, + context); + const int size = std::accumulate( + B_dims.cbegin(), B_dims.cend(), 1, std::multiplies()); + math::Neg(size, dB, dB, context); + return true; + } +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_ELEMENTWISE_SUB_OP_H_ diff --git a/caffe2/operators/elementwise_sub_op_gpu.cc b/caffe2/operators/elementwise_sub_op_gpu.cc new file mode 100644 index 00000000000000..72a0f115b18204 --- /dev/null +++ b/caffe2/operators/elementwise_sub_op_gpu.cc @@ -0,0 +1,17 @@ +#include "caffe2/operators/elementwise_sub_op.h" + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +REGISTER_CUDA_OPERATOR( + Sub, + BinaryElementwiseOp>); +REGISTER_CUDA_OPERATOR( + SubGradient, + BinaryElementwiseGradientOp< + NumericTypes, + CUDAContext, + SubFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/exp_op.cc b/caffe2/operators/exp_op.cc index d02f4b6b1f684e..d4b0b7b97a6ee8 100644 --- a/caffe2/operators/exp_op.cc +++ b/caffe2/operators/exp_op.cc @@ -1,19 +1,13 @@ -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/exp_op.h" -namespace caffe2 { +#include +#include -struct ExpCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Exp(n, x, y, device_context); - } -}; +namespace caffe2 { REGISTER_CPU_OPERATOR( Exp, - UnaryElementwiseOp, CPUContext, ExpCPUFunctor>); + UnaryElementwiseOp, CPUContext, ExpFunctor>); OPERATOR_SCHEMA(Exp) .NumInputs(1) @@ -33,15 +27,21 @@ and output blobs. "element-wise") .InheritOnnxSchema("Exp"); +namespace { + class GetExpGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( "Mul", "", - std::vector{O(0), GO(0)}, - std::vector{GI(0)}); + std::vector{O(0), GO(0)}, + std::vector{GI(0)}); } }; + +} // namespace + REGISTER_GRADIENT(Exp, GetExpGradient); + } // namespace caffe2 diff --git a/caffe2/operators/exp_op.cu b/caffe2/operators/exp_op.cu deleted file mode 100644 index c31b5441d7a45c..00000000000000 --- a/caffe2/operators/exp_op.cu +++ /dev/null @@ -1,30 +0,0 @@ -#include - -#include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" - -namespace caffe2 { - -template -__global__ void ExpKernel(const int N, const T* X, T* Y) { - CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = __expf(X[i]); - } -} - -struct ExpCUDAFunctor { - template - inline void operator()(const int n, const T* x, - T* y, CUDAContext* device_context) { - ExpKernel<<cuda_stream()>>>(n, x, y); - return; - } - inline bool InplaceAllowed() { - return true; - } -}; - -REGISTER_CUDA_OPERATOR( - Exp, UnaryElementwiseOp, CUDAContext, ExpCUDAFunctor>); -} // namespace caffe2 diff --git a/caffe2/operators/exp_op.h b/caffe2/operators/exp_op.h new file mode 100644 index 00000000000000..051827bcb5548a --- /dev/null +++ b/caffe2/operators/exp_op.h @@ -0,0 +1,20 @@ +#ifndef CAFFE2_OPERATORS_EXP_OP_ +#define CAFFE2_OPERATORS_EXP_OP_ + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct ExpFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Exp(N, X, Y, context); + return true; + } +}; + +} // namespace caffe2 + 
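The rewritten GetExpGradient above relies on the identity d/dx exp(x) = exp(x), so dX is produced by a single Mul of the forward output with the incoming gradient. A quick finite-difference check of that identity, as a standalone standard-C++ sketch rather than part of the patch:

#include <cmath>
#include <cstdio>

int main() {
  const double x = 0.7, dy = 1.3, eps = 1e-6;
  const double y = std::exp(x);
  // What the gradient maker computes: Mul(O(0), GO(0)) -> Y * dY.
  const double analytic = y * dy;
  // Central finite difference of exp(x), scaled by the upstream dY.
  const double numeric =
      dy * (std::exp(x + eps) - std::exp(x - eps)) / (2.0 * eps);
  std::printf("analytic=%.6f numeric=%.6f\n", analytic, numeric);
  return 0;
}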
+#endif // CAFFE2_OPERATORS_EXP_OP_ diff --git a/caffe2/operators/exp_op_gpu.cc b/caffe2/operators/exp_op_gpu.cc new file mode 100644 index 00000000000000..0b79a61430784a --- /dev/null +++ b/caffe2/operators/exp_op_gpu.cc @@ -0,0 +1,14 @@ +#include "caffe2/operators/exp_op.h" + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +REGISTER_CUDA_OPERATOR( + Exp, + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + ExpFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/group_norm_op.cc b/caffe2/operators/group_norm_op.cc index 59de7940d3341f..38db78ceb7866f 100644 --- a/caffe2/operators/group_norm_op.cc +++ b/caffe2/operators/group_norm_op.cc @@ -38,7 +38,7 @@ void GroupNormForward( const int i_mu = index[0] * dims[kGDim] + index[kGDim]; const int i_gamma = index[kGDim] * dims[kDDim] + index[kDDim]; Y[i] = gamma[i_gamma] * (X[i] - mu[i_mu]) * rsig[i_mu] + beta[i_gamma]; - math::internal::IncreaseIndexInDims(4, dims.data(), index.data()); + math::utils::IncreaseIndexInDims(4, dims.data(), index.data()); } } @@ -59,7 +59,7 @@ void ComputeInternalGradients( const int i_gamma = index[kGDim] * dims[kDDim] + index[kDDim]; ds[i_mu] += gamma[i_gamma] * dY[i] * X[i]; db[i_mu] += gamma[i_gamma] * dY[i]; - math::internal::IncreaseIndexInDims(4, dims.data(), index.data()); + math::utils::IncreaseIndexInDims(4, dims.data(), index.data()); } } @@ -102,7 +102,7 @@ void GroupNormBackward( dX[i] = gamma[i_gamma] * dY[i] * rsig[i_mu] + (u - v) * denom; dgamma[i_gamma] += dY[i] * (X[i] - mu[i_mu]) * rsig[i_mu]; dbeta[i_gamma] += dY[i]; - math::internal::IncreaseIndexInDims(4, dims.data(), index.data()); + math::utils::IncreaseIndexInDims(4, dims.data(), index.data()); } } diff --git a/caffe2/operators/group_norm_op.cu b/caffe2/operators/group_norm_op.cu index 20e61c1f31cc2b..116b963b74ae0c 100644 --- a/caffe2/operators/group_norm_op.cu +++ b/caffe2/operators/group_norm_op.cu @@ -23,7 +23,7 @@ template using BlockReduce = cub::BlockReduce; template -inline __device__ T CubeCUDA(const T x) { +inline __host__ __device__ T CubeCUDA(const T x) { return x * x * x; } diff --git a/caffe2/operators/log_op.cc b/caffe2/operators/log_op.cc index 1aa2bc4791f5c8..68e95e6f25ffb3 100644 --- a/caffe2/operators/log_op.cc +++ b/caffe2/operators/log_op.cc @@ -1,20 +1,13 @@ -#include "caffe2/operators/math_ops.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/log_op.h" +#include +#include namespace caffe2 { -struct LogCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Log(n, x, y, device_context); - } -}; - REGISTER_CPU_OPERATOR( Log, - UnaryElementwiseOp, CPUContext, LogCPUFunctor>); + UnaryElementwiseOp, CPUContext, LogFunctor>); OPERATOR_SCHEMA(Log) .NumInputs(1) @@ -34,16 +27,21 @@ and output blobs. 
"element-wise") .InheritOnnxSchema("Log"); +namespace { + class GetLogGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( "Div", "", - std::vector{GO(0), I(0)}, - std::vector{GI(0)}); + std::vector{GO(0), I(0)}, + std::vector{GI(0)}); } }; + +} // namespace + REGISTER_GRADIENT(Log, GetLogGradient); } // namespace caffe2 diff --git a/caffe2/operators/log_op.cu b/caffe2/operators/log_op.cu deleted file mode 100644 index 3fb422cd7a3032..00000000000000 --- a/caffe2/operators/log_op.cu +++ /dev/null @@ -1,17 +0,0 @@ -#include "caffe2/core/context_gpu.h" -#include "caffe2/operators/math_ops.h" - -namespace caffe2 { - -struct LogCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - math::Log(n, x, y, device_context); - } -}; - -REGISTER_CUDA_OPERATOR( - Log, - UnaryElementwiseOp, CUDAContext, LogCUDAFunctor>); -} diff --git a/caffe2/operators/log_op.h b/caffe2/operators/log_op.h new file mode 100644 index 00000000000000..7c420c11d6004c --- /dev/null +++ b/caffe2/operators/log_op.h @@ -0,0 +1,20 @@ +#ifndef CAFFE2_OPERATORS_LOG_OP_H_ +#define CAFFE2_OPERATORS_LOG_OP_H_ + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct LogFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Log(N, X, Y, context); + return true; + } +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_LOG_OP_H_ diff --git a/caffe2/operators/log_op_gpu.cc b/caffe2/operators/log_op_gpu.cc new file mode 100644 index 00000000000000..dbf920d8afc143 --- /dev/null +++ b/caffe2/operators/log_op_gpu.cc @@ -0,0 +1,14 @@ +#include "caffe2/operators/log_op.h" + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +REGISTER_CUDA_OPERATOR( + Log, + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + LogFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/logit_op.cc b/caffe2/operators/logit_op.cc index e9eff0feb50300..8d1859a405a49d 100644 --- a/caffe2/operators/logit_op.cc +++ b/caffe2/operators/logit_op.cc @@ -1,28 +1,23 @@ #include "caffe2/operators/logit_op.h" -#include "caffe2/operators/elementwise_op.h" -namespace caffe2 { -struct LogitCPUFunctor { - explicit LogitCPUFunctor(OperatorBase& op) - : eps_(op.GetSingleArgument("eps", 1e-6f)) { - CAFFE_ENFORCE_GT(eps_, 0.0); - CAFFE_ENFORCE_LT(eps_, 0.5); - } - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* /* unused */) { - ConstEigenArrayMap X(x, n, 1); - EigenArrayMap Y(y, n, 1); - const T k_one = 1.0; +#include +#include - Y = X.min(k_one - eps_); - Y = Y.max(eps_); - Y = (Y / (k_one - Y)).log(); - } +#include "caffe2/operators/elementwise_ops.h" - private: - float eps_; -}; +namespace caffe2 { + +template <> +template +bool LogitFunctor:: +operator()(const int size, const T* X, T* Y, CPUContext* /* context */) const { + ConstEigenVectorMap X_vec(X, size); + EigenVectorMap Y_vec(Y, size); + Y_vec = X_vec.array().min(static_cast(1.0f - eps_)); + Y_vec = Y_vec.array().max(eps_); + Y_vec = (Y_vec.array() / (T(1) - Y_vec.array())).log(); + return true; +} template <> bool LogitGradientOp::RunOnDevice() { @@ -47,7 +42,7 @@ REGISTER_CPU_OPERATOR( UnaryElementwiseWithArgsOp< TensorTypes, CPUContext, - LogitCPUFunctor>); + LogitFunctor>); REGISTER_CPU_OPERATOR(LogitGradient, LogitGradientOp); @@ -72,16 +67,21 @@ 
OPERATOR_SCHEMA(LogitGradient) .Output(0, "dX", "output float tensor") .Arg("eps", "small positive epsilon value, the default is 1e-6."); +namespace { + class GetLogitGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; vector GetGradientDefs() override { return vector{CreateOperatorDef( "LogitGradient", "", - std::vector{I(0), GO(0)}, - std::vector{GI(0)})}; + std::vector{I(0), GO(0)}, + std::vector{GI(0)})}; } }; +} // namespace + REGISTER_GRADIENT(Logit, GetLogitGradient); + } // namespace caffe2 diff --git a/caffe2/operators/logit_op.cu b/caffe2/operators/logit_op.cu index fd419e15fe5764..d2e8351fcac950 100644 --- a/caffe2/operators/logit_op.cu +++ b/caffe2/operators/logit_op.cu @@ -1,49 +1,46 @@ -#include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" #include "caffe2/operators/logit_op.h" +#include "caffe2/core/context_gpu.h" + namespace caffe2 { +namespace { + template __global__ void LogitKernel(const int N, const T* X, const float eps, T* Y) { CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = fminf(X[i], (1.0 - eps)); + Y[i] = fminf(X[i], (T(1) - eps)); Y[i] = fmaxf(Y[i], eps); - Y[i] = logf(Y[i] / (1.0 - Y[i])); + Y[i] = logf(Y[i] / (T(1) - Y[i])); } } +template __global__ void LogitGradientKernel( const int N, - const float* X, - const float* dY, + const T* X, + const T* dY, const float eps, - float* dX) { + T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { - dX[i] = (X[i] < eps || X[i] > 1.0 - eps) ? 0 : (dY[i] / X[i] / (1 - X[i])); + dX[i] = (X[i] < eps || X[i] > T(1) - eps) ? T(0) + : (dY[i] / X[i] / (T(1) - X[i])); } } -struct LogitCUDAFunctor { - explicit LogitCUDAFunctor(OperatorBase& op) - : eps_(op.GetSingleArgument("eps", 1e-6)) { - CAFFE_ENFORCE_GT(eps_, 0.0); - CAFFE_ENFORCE_LT(eps_, 0.5); - } - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - LogitKernel - <<cuda_stream()>>>(n, x, eps_, y); - return; - } +} // namespace - private: - float eps_; -}; +template <> +template +bool LogitFunctor:: +operator()(const int N, const T* X, T* Y, CUDAContext* context) const { + LogitKernel + <<cuda_stream()>>>(N, X, eps_, Y); + return true; +} template <> bool LogitGradientOp::RunOnDevice() { @@ -66,6 +63,7 @@ REGISTER_CUDA_OPERATOR( UnaryElementwiseWithArgsOp< TensorTypes, CUDAContext, - LogitCUDAFunctor>); + LogitFunctor>); REGISTER_CUDA_OPERATOR(LogitGradient, LogitGradientOp); + } // namespace caffe2 diff --git a/caffe2/operators/logit_op.h b/caffe2/operators/logit_op.h index 09a6620bbf4477..1e460baec894e8 100644 --- a/caffe2/operators/logit_op.h +++ b/caffe2/operators/logit_op.h @@ -1,9 +1,26 @@ +#ifndef CAFFE2_OPERATORS_LOGIT_OP_H_ +#define CAFFE2_OPERATORS_LOGIT_OP_H_ + #include "caffe2/core/context.h" #include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/elementwise_ops.h" namespace caffe2 { +template +struct LogitFunctor { + explicit LogitFunctor(OperatorBase& op) + : eps_(op.GetSingleArgument("eps", 1e-6f)) { + CAFFE_ENFORCE_GT(eps_, 0.0); + CAFFE_ENFORCE_LT(eps_, 0.5); + } + + template + bool operator()(const int size, const T* X, T* Y, Context* context) const; + + const float eps_; +}; + template class LogitGradientOp final : public Operator { public: @@ -20,3 +37,5 @@ class LogitGradientOp final : public Operator { }; } // namespace caffe2 + +#endif // CAFFE2_OPERATORS_LOGIT_OP_H_ diff --git a/caffe2/operators/math_ops.cc b/caffe2/operators/math_ops.cc deleted file mode 100644 index 72fe0eb59a5346..00000000000000 --- a/caffe2/operators/math_ops.cc 
+++ /dev/null @@ -1,69 +0,0 @@ -#include "caffe2/operators/math_ops.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -struct SqrCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Sqr(n, x, y, device_context); - } -}; - -REGISTER_CPU_OPERATOR( - Sqr, - UnaryElementwiseOp, CPUContext, SqrCPUFunctor>); - -OPERATOR_SCHEMA(Sqr) - .NumInputs(1) - .NumOutputs(1) - .AllowInplace({{0, 0}}) - .IdenticalTypeAndShape() - .SetDoc("Square (x^2) the elements of the input") - .Input(0, "input", "Input tensor") - .Output(0, "output", "Squared elements of the input"); - -class GetSqrGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - Argument scale_arg; - scale_arg.set_name("scale"); - scale_arg.set_f(2.0); - return vector{CreateOperatorDef( - "Scale", - "", - std::vector{GO(0)}, - std::vector{GO(0)}, - std::vector{scale_arg}), - CreateOperatorDef( - "Mul", - "", - std::vector{GO(0), I(0)}, - std::vector{GI(0)})}; - } -}; -REGISTER_GRADIENT(Sqr, GetSqrGradient); - -struct SignCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - for (int i = 0; i < n; ++i) { - y[i] = (-T(1) * (x[i] < 0)) + (x[i] > 0); - } - } -}; - -REGISTER_CPU_OPERATOR( - Sign, - UnaryElementwiseOp, CPUContext, SignCPUFunctor>); - -OPERATOR_SCHEMA(Sign) - .NumInputs(1) - .NumOutputs(1) - .SetDoc("Computes sign for each element of the input: -1, 0 or 1.") - .IdenticalTypeAndShape(); -SHOULD_NOT_DO_GRADIENT(Sign); - -} // namespace caffe2 diff --git a/caffe2/operators/math_ops.cu b/caffe2/operators/math_ops.cu deleted file mode 100644 index 9cd6ad64ac66bc..00000000000000 --- a/caffe2/operators/math_ops.cu +++ /dev/null @@ -1,39 +0,0 @@ -#include "caffe2/core/context_gpu.h" -#include "caffe2/operators/math_ops.h" - -namespace caffe2 { - -struct SqrCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - math::Sqr(n, x, y, device_context); - } -}; - -template -__global__ void SignKernel(int n, const T* x, T* y) { - CUDA_1D_KERNEL_LOOP(i, n) { - y[i] = (-T(1) * (x[i] < 0)) + (x[i] > 0); - } -} - -struct SignCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - SignKernel<<< - CAFFE_GET_BLOCKS(n), - CAFFE_CUDA_NUM_THREADS, - 0, - device_context->cuda_stream()>>>(n, x, y); - } -}; - -REGISTER_CUDA_OPERATOR( - Sqr, - UnaryElementwiseOp, CUDAContext, SqrCUDAFunctor>); -REGISTER_CUDA_OPERATOR( - Sign, - UnaryElementwiseOp, CUDAContext, SignCUDAFunctor>); -} diff --git a/caffe2/operators/math_ops.h b/caffe2/operators/math_ops.h deleted file mode 100644 index b411b8ed6920ea..00000000000000 --- a/caffe2/operators/math_ops.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef CAFFE2_OPERATORS_MATH_OP_H_ -#define CAFFE2_OPERATORS_MATH_OP_H_ - -#include "caffe2/core/common_omp.h" -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/core/tensor.h" -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" - -#endif diff --git a/caffe2/operators/moments_op.cc b/caffe2/operators/moments_op.cc index 930339e737203f..49ecdf7d577c53 100644 --- a/caffe2/operators/moments_op.cc +++ b/caffe2/operators/moments_op.cc @@ -23,13 +23,13 @@ bool MomentsGradientOp::Compute( const T norm = static_cast(dY_size) / static_cast(dX_size); for (int dX_index = 0; dX_index < 
dX_size; ++dX_index) { const int dY_index = - math::internal::GetIndexFromDims(ndim, dY_dims.data(), index.data()); + math::utils::GetIndexFromDims(ndim, dY_dims.data(), index.data()); dX_data[dX_index] = (dmean_data[dY_index] + static_cast(2) * (X_data[dX_index] - mean_data[dY_index]) * dvariance_data[dY_index]) * norm; - math::internal::IncreaseIndexInDims(ndim, dX_dims.data(), index.data()); + math::utils::IncreaseIndexInDims(ndim, dX_dims.data(), index.data()); } return true; } diff --git a/caffe2/operators/negative_op.cc b/caffe2/operators/negative_op.cc index 239cd972d0869b..6f2b054bd2963d 100644 --- a/caffe2/operators/negative_op.cc +++ b/caffe2/operators/negative_op.cc @@ -1,21 +1,13 @@ -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/negative_op.h" -namespace caffe2 { +#include +#include -struct NegativeCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* /*device_context*/) { - EigenVectorMap(y, n) = -ConstEigenVectorMap(x, n); - // for (int i = 0; i < n; ++i) { - // y[i] = -x[i]; - //} - } -}; +namespace caffe2 { REGISTER_CPU_OPERATOR( - Negative, UnaryElementwiseOp< - TensorTypes, CPUContext, NegativeCPUFunctor>); + Negative, + UnaryElementwiseOp>); // Input: X, output: Y OPERATOR_SCHEMA(Negative) @@ -30,14 +22,21 @@ Computes the element-wise negative of the input. .Output(0, "Y", "1D input tensor") .InheritOnnxSchema("Neg"); +namespace { + class GetNegativeGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( - "Negative", "", - vector{GO(0)}, - vector{GI(0)}); + "Negative", + "", + std::vector{GO(0)}, + std::vector{GI(0)}); } }; + +} // namespace + REGISTER_GRADIENT(Negative, GetNegativeGradient); -} // namespace caffe2 + +} // namespace caffe2 diff --git a/caffe2/operators/negative_op.cu b/caffe2/operators/negative_op.cu deleted file mode 100644 index 1bd5cea474ae46..00000000000000 --- a/caffe2/operators/negative_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -#include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" - -namespace caffe2 { - -template -__global__ void NegativeKernel(const int N, const T* x, T* y) { - CUDA_1D_KERNEL_LOOP(i, N) { - y[i] = -x[i]; - } -} - -struct NegativeCUDAFunctor { - template - inline void operator()(const int n, const T* x, - T* y, CUDAContext* device_context) { - NegativeKernel<<cuda_stream()>>>(n, x, y); - return; - } -}; - -REGISTER_CUDA_OPERATOR( - Negative, UnaryElementwiseOp< - TensorTypes, CUDAContext, - NegativeCUDAFunctor>); -} // namespace caffe2 diff --git a/caffe2/operators/negative_op.h b/caffe2/operators/negative_op.h new file mode 100644 index 00000000000000..876a83732307bd --- /dev/null +++ b/caffe2/operators/negative_op.h @@ -0,0 +1,20 @@ +#ifndef CAFFE2_OPERATORS_NEGATIVE_OP_H_ +#define CAFFE2_OPERATORS_NEGATIVE_OP_H_ + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct NegativeFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Neg(N, X, Y, context); + return true; + } +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_NEGATIVE_OP_H_ diff --git a/caffe2/operators/negative_op_gpu.cc b/caffe2/operators/negative_op_gpu.cc new file mode 100644 index 00000000000000..dffc947eff6755 --- /dev/null +++ b/caffe2/operators/negative_op_gpu.cc @@ -0,0 +1,14 @@ +#include 
"caffe2/operators/negative_op.h" + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +REGISTER_CUDA_OPERATOR( + Negative, + UnaryElementwiseOp< + NumericTypes, + CUDAContext, + NegativeFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/pow_op.h b/caffe2/operators/pow_op.h index 054ed7278ab380..475499744c07f6 100644 --- a/caffe2/operators/pow_op.h +++ b/caffe2/operators/pow_op.h @@ -7,7 +7,7 @@ #include "caffe2/core/operator.h" #include "caffe2/utils/math.h" // definition of NumericTypes and SameTypeAsInput is in below header file -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/elementwise_ops.h" namespace caffe2 { @@ -103,7 +103,7 @@ class PowOp : public Operator { A.size(), Adata, Bdata, 0, Cdata, &context_); } else { size_t pre, n, post; - std::tie(pre, n, post) = calculate_broadcast_sizes(A, B, axis_); + std::tie(pre, n, post) = ComputeLegacyBroadcastSizes(A, B, axis_); if (post == 1) { functor_.template RunWithBroadcast( Adata, Bdata, Cdata, pre, n, &context_); diff --git a/caffe2/operators/reduce_ops.cc b/caffe2/operators/reduce_ops.cc index fe1a5b5f3f9bcf..aa635118dbdbbc 100644 --- a/caffe2/operators/reduce_ops.cc +++ b/caffe2/operators/reduce_ops.cc @@ -24,10 +24,10 @@ void ComputeReduceMinMaxGradient( std::vector index(ndim, 0); for (int dX_index = 0; dX_index < dX_size; ++dX_index) { const int dY_index = - math::internal::GetIndexFromDims(ndim, dY_dims.data(), index.data()); + math::utils::GetIndexFromDims(ndim, dY_dims.data(), index.data()); dX_data[dX_index] = Y_data[dY_index] == X_data[dX_index] ? dY_data[dY_index] : T(0); - math::internal::IncreaseIndexInDims(ndim, dX_dims.data(), index.data()); + math::utils::IncreaseIndexInDims(ndim, dX_dims.data(), index.data()); } } diff --git a/caffe2/operators/reduce_ops.h b/caffe2/operators/reduce_ops.h index 16e53193b1bc4a..2112ba797e566e 100644 --- a/caffe2/operators/reduce_ops.h +++ b/caffe2/operators/reduce_ops.h @@ -120,7 +120,7 @@ class ReduceGradientOp final : public Operator { private: std::vector axes_; - const Reducer reducer_{}; + Reducer reducer_{}; }; template diff --git a/caffe2/operators/rsqrt_op.cc b/caffe2/operators/rsqrt_op.cc index 16cea45de71e14..2180a10c010162 100644 --- a/caffe2/operators/rsqrt_op.cc +++ b/caffe2/operators/rsqrt_op.cc @@ -1,32 +1,35 @@ #include "caffe2/operators/rsqrt_op.h" +#include +#include #include -#include namespace caffe2 { template <> struct RSqrtFunctor { template - inline void operator()( - const int size, - const T* X, - T* Y, - CPUContext* /* context */) const { + bool operator()(const int size, const T* X, T* Y, CPUContext* /* context */) + const { EigenArrayMap(Y, 1, size) = ConstEigenArrayMap(X, 1, size).rsqrt(); + return true; } }; template <> template -void RSqrtGradientFunctor::Run( - const int size, +bool RSqrtGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* Y_dims */, const T* dY, const T* Y, T* dX, CPUContext* /* context */) const { - EigenArrayMap(dX, 1, size) = ConstEigenArrayMap(dY, 1, size) * - ConstEigenArrayMap(Y, 1, size).cube() * static_cast(-0.5); + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + EigenVectorMap(dX, size) = ConstEigenVectorMap(dY, size).array() * + ConstEigenVectorMap(Y, size).array().cube() * static_cast(-0.5); + return true; } REGISTER_CPU_OPERATOR( @@ -40,7 +43,7 @@ REGISTER_CPU_OPERATOR( BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>>); + RSqrtGradientFunctor>); OPERATOR_SCHEMA(RSqrt) .NumInputs(1) 
diff --git a/caffe2/operators/rsqrt_op.cu b/caffe2/operators/rsqrt_op.cu index 3788abf70689a1..710b7c9d898376 100644 --- a/caffe2/operators/rsqrt_op.cu +++ b/caffe2/operators/rsqrt_op.cu @@ -1,5 +1,8 @@ #include "caffe2/operators/rsqrt_op.h" +#include +#include + #include "caffe2/core/context_gpu.h" namespace caffe2 { @@ -7,7 +10,7 @@ namespace caffe2 { namespace { template -inline __device__ T CubeCUDA(const T x) { +inline __host__ __device__ T CubeCUDA(const T x) { return x * x * x; } @@ -27,17 +30,21 @@ RSqrtGradientCUDAKernel(const int size, const T* dY, const T* Y, T* dX) { template <> template -void RSqrtGradientFunctor::Run( - const int size, +bool RSqrtGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* Y_dims */, const T* dY, const T* Y, T* dX, CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); RSqrtGradientCUDAKernel <<cuda_stream()>>>(size, dY, Y, dX); + return true; } REGISTER_CUDA_OPERATOR( @@ -51,6 +58,6 @@ REGISTER_CUDA_OPERATOR( BinaryElementwiseOp< TensorTypes, CUDAContext, - WithoutBroadcast>>); + RSqrtGradientFunctor>); } // namespace caffe2 diff --git a/caffe2/operators/rsqrt_op.h b/caffe2/operators/rsqrt_op.h index 575854a66600ca..93073458845658 100644 --- a/caffe2/operators/rsqrt_op.h +++ b/caffe2/operators/rsqrt_op.h @@ -1,9 +1,9 @@ #ifndef CAFFE2_OPERATORS_RSQRT_OP_H_ #define CAFFE2_OPERATORS_RSQRT_OP_H_ -#include "caffe2/core/context.h" -#include "caffe2/core/operator.h" -#include "caffe2/operators/elementwise_op.h" +#include + +#include "caffe2/operators/elementwise_ops.h" #include "caffe2/utils/math.h" namespace caffe2 { @@ -11,17 +11,22 @@ namespace caffe2 { template struct RSqrtFunctor { template - inline void operator()(const int size, const T* X, T* Y, Context* context) - const { - math::InvSqrt(size, X, Y, context); + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::InvSqrt(N, X, Y, context); + return true; } }; template struct RSqrtGradientFunctor { template - inline void - Run(const int size, const T* dY, const T* Y, T* dX, Context* context) const; + bool Forward( + const std::vector& dY_dims, + const std::vector& Y_dims, + const T* dY, + const T* Y, + T* dX, + Context* context) const; }; } // namespace caffe2 diff --git a/caffe2/operators/shape_op.cc b/caffe2/operators/shape_op.cc index 4d19fe08c2c939..de36a2b1280799 100644 --- a/caffe2/operators/shape_op.cc +++ b/caffe2/operators/shape_op.cc @@ -22,7 +22,7 @@ OPERATOR_SCHEMA(Shape) } else { out[0].add_dims(axes.size()); } - out[0].set_data_type(TensorProto::INT32); + out[0].set_data_type(TensorProto::INT64); return out; }) .SetDoc(R"DOC( diff --git a/caffe2/operators/sigmoid_op.cc b/caffe2/operators/sigmoid_op.cc index d908f7eeb705db..ad86954d975a83 100644 --- a/caffe2/operators/sigmoid_op.cc +++ b/caffe2/operators/sigmoid_op.cc @@ -1,72 +1,90 @@ -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/sigmoid_op.h" + +#include +#include + #include "caffe2/utils/math.h" namespace caffe2 { -struct SigmoidCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* /*device_context*/) { - ConstEigenVectorArrayMap xM(x, n); - EigenVectorArrayMap(y, n) = 1. / (1. + (-xM).exp()); - } -}; +template <> +template +bool SigmoidFunctor:: +operator()(const int N, const T* X, T* Y, CPUContext* /* context */) const { + ConstEigenVectorArrayMap X_arr(X, N); + EigenVectorArrayMap(Y, N) = 1. / (1. 
+ (-X_arr).exp()); + return true; +} -struct SigmoidGradientCPUFunctor { - template - inline void Run( - const int n, - const T* y, - const T* dy, - T* dx, - CPUContext* /*device_context*/) { - ConstEigenVectorArrayMap yM(y, n), dyM(dy, n); - EigenVectorArrayMap(dx, n) = dyM * yM * (1. - yM); - } -}; +template <> +template +bool SigmoidGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* Y_dims */, + const T* dY, + const T* Y, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap Y_arr(Y, size); + EigenVectorArrayMap(dX, size) = dY_arr * Y_arr * (1. - Y_arr); + return true; +} REGISTER_CPU_OPERATOR( - Sigmoid, UnaryElementwiseOp< - TensorTypes, CPUContext, SigmoidCPUFunctor>); + Sigmoid, + UnaryElementwiseOp< + TensorTypes, + CPUContext, + SigmoidFunctor>); REGISTER_CPU_OPERATOR( SigmoidGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + SigmoidGradientFunctor>); // Input: X, output: Y OPERATOR_SCHEMA(Sigmoid) - .NumInputs(1) - .NumOutputs(1) - .AllowInplace({{0, 0}}) - .IdenticalTypeAndShape() - .SetDoc(R"DOC( + .NumInputs(1) + .NumOutputs(1) + .AllowInplace({{0, 0}}) + .IdenticalTypeAndShape() + .SetDoc(R"DOC( Sigmoid takes one input data (Tensor) and produces one output data (Tensor) where the sigmoid function, y = 1 / (1 + exp(-x)), is applied to the tensor elementwise. )DOC") - .Input(0, "X", "1D input tensor") - .Output(0, "Y", "1D output tensor") - .InheritOnnxSchema("Sigmoid"); + .Input(0, "X", "1D input tensor") + .Output(0, "Y", "1D output tensor") + .InheritOnnxSchema("Sigmoid"); // Input: Y, dY, output: dX OPERATOR_SCHEMA(SigmoidGradient) - .NumInputs(2) - .NumOutputs(1) - .AllowInplace({{1, 0}}) - .SetDoc(R"DOC( + .NumInputs(2) + .NumOutputs(1) + .AllowInplace({{0, 0}}) + .SetDoc(R"DOC( SigmoidGradient takes both Y and dY and uses this to update dX according to the chain rule and derivatives of the sigmoid function. )DOC"); +namespace { + class GetSigmoidGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( - "SigmoidGradient", "", - vector{O(0), GO(0)}, - vector{GI(0)}); + "SigmoidGradient", + "", + std::vector{GO(0), O(0)}, + std::vector{GI(0)}); } }; + +} // namespace + REGISTER_GRADIENT(Sigmoid, GetSigmoidGradient); -} // namespace caffe2 + +} // namespace caffe2 diff --git a/caffe2/operators/sigmoid_op.cu b/caffe2/operators/sigmoid_op.cu index cacf6eb05ca7e8..3fb51842795a10 100644 --- a/caffe2/operators/sigmoid_op.cu +++ b/caffe2/operators/sigmoid_op.cu @@ -1,50 +1,81 @@ -#include +#include "caffe2/operators/sigmoid_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { +namespace { + template -__global__ void SigmoidKernel(const int N, const T* x, T* y) { +__global__ void SigmoidKernel(const int N, const T* X, T* Y) { CUDA_1D_KERNEL_LOOP(i, N) { - y[i] = 1. / (1. + exp(-x[i])); +#if __CUDA_ARCH__ >= 350 + Y[i] = T(1) / (T(1) + exp(-__ldg(X + i))); +#else + Y[i] = T(1) / (T(1) + exp(-X[i])); +#endif } } template -__global__ void SigmoidGradientKernel(const int N, const T* y, const T* dy, - T* dx) { +__global__ void +SigmoidGradientKernel(const int N, const T* dY, const T* Y, T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { - dx[i] = dy[i] * y[i] * (1. 
- y[i]); +#if __CUDA_ARCH__ >= 350 + dX[i] = __ldg(dY + i) * __ldg(Y + i) * (T(1) - __ldg(Y + i)); +#else + dX[i] = dY[i] * Y[i] * (T(1) - Y[i]); +#endif } } -struct SigmoidCUDAFunctor { - template - inline void operator()(const int n, const T* x, - T* y, CUDAContext* device_context) { - SigmoidKernel<<cuda_stream()>>>(n, x, y); - return; - } -}; - -struct SigmoidGradientCUDAFunctor { - template - inline void Run(const int n, const T* y, const T* dy, - T* dx, CUDAContext* device_context) { - SigmoidGradientKernel<<cuda_stream()>>>(n, y, dy, dx); - return; - } -}; +} // namespace + +template <> +template +bool SigmoidFunctor:: +operator()(const int N, const T* X, T* Y, CUDAContext* context) const { + SigmoidKernel + <<cuda_stream()>>>(N, X, Y); + return true; +} + +template <> +template +bool SigmoidGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* Y_dims */, + const T* dY, + const T* Y, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + SigmoidGradientKernel + <<cuda_stream()>>>(size, dY, Y, dX); + return true; +} REGISTER_CUDA_OPERATOR( Sigmoid, - UnaryElementwiseOp, CUDAContext, SigmoidCUDAFunctor>); + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + SigmoidFunctor>); REGISTER_CUDA_OPERATOR( - SigmoidGradient, BinaryElementwiseOp< - TensorTypes, CUDAContext, - WithoutBroadcast>); -} // namespace caffe2 + SigmoidGradient, + BinaryElementwiseOp< + TensorTypes, + CUDAContext, + SigmoidGradientFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/sigmoid_op.h b/caffe2/operators/sigmoid_op.h new file mode 100644 index 00000000000000..686fe12c4f6b38 --- /dev/null +++ b/caffe2/operators/sigmoid_op.h @@ -0,0 +1,30 @@ +#ifndef CAFFE2_OPERATORS_SIGMOID_OP_H_ +#define CAFFE2_OPERATORS_SIGMOID_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" + +namespace caffe2 { + +template +struct SigmoidFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const; +}; + +template +struct SigmoidGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& Y_dims, + const T* dY, + const T* Y, + T* dX, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_SIGMOID_OP_H_ diff --git a/caffe2/operators/sin_op.cc b/caffe2/operators/sin_op.cc index 8aa9cf77ca6845..d64202c8689164 100644 --- a/caffe2/operators/sin_op.cc +++ b/caffe2/operators/sin_op.cc @@ -1,35 +1,36 @@ -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/sin_op.h" -namespace caffe2 { +#include +#include -struct SinCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Sin(n, x, y, device_context); - } -}; +namespace caffe2 { -struct SinGradientCPUFunctor { - template - inline void - Run(const int n, const T* x, const T* dy, T* dx, CPUContext* /* unused */) { - ConstEigenVectorArrayMap dyM(dy, n); - ConstEigenVectorArrayMap xM(x, n); - EigenVectorMap(dx, n) = dyM * cos(xM); - } -}; +template <> +template +bool SinGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap X_arr(X, size); + EigenVectorMap(dX, size) = dY_arr * 
X_arr.cos(); + return true; +} REGISTER_CPU_OPERATOR( Sin, - UnaryElementwiseOp, CPUContext, SinCPUFunctor>); + UnaryElementwiseOp, CPUContext, SinFunctor>); REGISTER_CPU_OPERATOR( SinGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + SinGradientFunctor>); OPERATOR_SCHEMA(Sin) .NumInputs(1) @@ -43,15 +44,21 @@ Calculates the sine of the given input tensor, element-wise. OPERATOR_SCHEMA(SinGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape(); +namespace { + class GetSinGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( "SinGradient", "", - std::vector{I(0), GO(0)}, - std::vector{GI(0)}); + std::vector{GO(0), I(0)}, + std::vector{GI(0)}); } }; + +} // namespace + REGISTER_GRADIENT(Sin, GetSinGradient); + } // namespace caffe2 diff --git a/caffe2/operators/sin_op.cu b/caffe2/operators/sin_op.cu index 59849dcd2930b7..fb984f827e3bd8 100644 --- a/caffe2/operators/sin_op.cu +++ b/caffe2/operators/sin_op.cu @@ -1,61 +1,58 @@ -#include +#include "caffe2/operators/sin_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { -template -__global__ void SinKernel(const int N, const T* X, T* Y) { - CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = sin(X[i]); - } -} +namespace { template -__global__ void SinGradientKernel(const int N, const T* X, const T* dY, T* dX) { +__global__ void +SinGradientCUDAKernel(const int N, const T* dY, const T* X, T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { +#if __CUDA_ARCH__ >= 350 + dX[i] = __ldg(dY + i) * cos(__ldg(X + i)); +#else dX[i] = dY[i] * cos(X[i]); +#endif } } -struct SinCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - SinKernel - <<cuda_stream()>>>(n, x, y); - return; - } -}; - -struct SinGradientCUDAFunctor { - template - inline void Run( - const int n, - const T* x, - const T* dy, - T* dx, - CUDAContext* device_context) { - SinGradientKernel - <<cuda_stream()>>>(n, x, dy, dx); - return; - } -}; +} // namespace + +template <> +template +bool SinGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + SinGradientCUDAKernel + <<cuda_stream()>>>(size, dY, X, dX); + return true; +} REGISTER_CUDA_OPERATOR( Sin, - UnaryElementwiseOp, CUDAContext, SinCUDAFunctor>); + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + SinFunctor>); REGISTER_CUDA_OPERATOR( SinGradient, BinaryElementwiseOp< TensorTypes, CUDAContext, - WithoutBroadcast>); + SinGradientFunctor>); + } // namespace caffe2 diff --git a/caffe2/operators/sin_op.h b/caffe2/operators/sin_op.h new file mode 100644 index 00000000000000..aa0aae9a049cd2 --- /dev/null +++ b/caffe2/operators/sin_op.h @@ -0,0 +1,34 @@ +#ifndef CAFFE2_OPERATORS_SIN_OP_H_ +#define CAFFE2_OPERATORS_SIN_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct SinFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Sin(N, X, Y, context); + return true; + } +}; + +template +struct SinGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& X_dims, + const T* dY, + const T* X, + T* 
dX, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_SIN_OP_H_ diff --git a/caffe2/operators/softsign_op.cc b/caffe2/operators/softsign_op.cc index eee80ec834f7ec..e7c72309148e77 100644 --- a/caffe2/operators/softsign_op.cc +++ b/caffe2/operators/softsign_op.cc @@ -1,40 +1,49 @@ -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/softsign_op.h" -namespace caffe2 { +#include +#include -struct SoftsignCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* /*device_context*/) { - ConstEigenVectorArrayMap x_arr(x, n); - EigenVectorMap(y, n) = (1 + x_arr.abs()).inverse() * x_arr; - } -}; +namespace caffe2 { -struct SoftsignGradientCPUFunctor { - template - inline void Run( - const int n, - const T* x, - const T* dy, - T* dx, - CPUContext* /*device_context*/) { - ConstEigenVectorArrayMap dy_arr(dy, n); - ConstEigenVectorArrayMap x_arr(x, n); - EigenVectorMap(dx, n) = dy_arr * (1 + x_arr.abs()).pow(2).inverse(); - } -}; +template <> +template +bool SoftsignFunctor:: +operator()(const int N, const T* X, T* Y, CPUContext* /* context */) const { + ConstEigenVectorArrayMap X_arr(X, N); + EigenVectorMap(Y, N) = (T(1) + X_arr.abs()).inverse() * X_arr; + return true; +} + +template <> +template +bool SoftsignGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap X_arr(X, size); + EigenVectorMap(dX, size) = + dY_arr * (T(1) + X_arr.abs()).square().inverse(); + return true; +} REGISTER_CPU_OPERATOR( Softsign, - UnaryElementwiseOp, CPUContext, SoftsignCPUFunctor>); + UnaryElementwiseOp< + TensorTypes, + CPUContext, + SoftsignFunctor>); REGISTER_CPU_OPERATOR( SoftsignGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + SoftsignGradientFunctor>); OPERATOR_SCHEMA(Softsign) .NumInputs(1) @@ -100,7 +109,7 @@ print("Y:\n", workspace.FetchBlob("Y")) OPERATOR_SCHEMA(SoftsignGradient) .NumInputs(2) .NumOutputs(1) - .AllowInplace({{1, 0}}) + .AllowInplace({{0, 0}}) .SetDoc(R"DOC( Calculates the softsign gradient (sgn(x)/(1+|x|)^2) of the given input tensor element-wise. @@ -113,9 +122,11 @@ element-wise. 
"The softsign gradient (sgn(x)/(1+|x|)^2) values of the input tensor " "computed element-wise"); +namespace { + class GetSoftsignGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { CAFFE_ENFORCE( I(0) != O(0), "Cannot compute softsign gradient " @@ -124,11 +135,13 @@ class GetSoftsignGradient : public GradientMakerBase { return SingleGradientDef( "SoftsignGradient", "", - vector{I(0), GO(0)}, - vector{GI(0)}); + std::vector{GO(0), I(0)}, + std::vector{GI(0)}); } }; +} // namespace + REGISTER_GRADIENT(Softsign, GetSoftsignGradient); } // namespace caffe2 diff --git a/caffe2/operators/softsign_op.cu b/caffe2/operators/softsign_op.cu index 8fec50c62a6754..e7bb5048194d04 100644 --- a/caffe2/operators/softsign_op.cu +++ b/caffe2/operators/softsign_op.cu @@ -1,55 +1,86 @@ -#include +#include "caffe2/operators/softsign_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { +namespace { + template -__global__ void SoftsignKernel(const int N, const T* X, T* Y) { +inline __host__ __device__ T SquareCUDA(const T x) { + return x * x; +} + +template +__global__ void SoftsignCUDAKernel(const int N, const T* X, T* Y) { CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = X[i] / (1 + abs(X[i])); +#if __CUDA_ARCH__ >= 350 + Y[i] = __ldg(X + i) / (T(1) + abs(__ldg(X + i))); +#else + Y[i] = X[i] / (T(1) + abs(X[i])); +#endif } } template -__global__ void SoftsignGradientKernel(const int N, const T* x, const T* dy, - T* dx) { +__global__ void +SoftsignGradientCUDAKernel(const int N, const T* dY, const T* X, T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { - dx[i] = dy[i] / pow(1 + abs(x[i]), 2); +#if __CUDA_ARCH__ >= 350 + dX[i] = __ldg(dY + i) / SquareCUDA(T(1) + abs(__ldg(X + i))); +#else + dX[i] = dY[i] / SquareCUDA(T(1) + abs(X[i])); +#endif } } -struct SoftsignCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - SoftsignKernel<<< - CAFFE_GET_BLOCKS(n), - CAFFE_CUDA_NUM_THREADS, - 0, - device_context->cuda_stream()>>>(n, x, y); - return; - } -}; - -struct SoftsignGradientCUDAFunctor { - template - inline void - Run(const int n, const T* x, const T* dy, T* dx, CUDAContext* device_context) { - SoftsignGradientKernel<<< - CAFFE_GET_BLOCKS(n), - CAFFE_CUDA_NUM_THREADS, - 0, - device_context->cuda_stream()>>>(n, x, dy, dx); - return; - } -}; +} // namespace + +template <> +template +bool SoftsignFunctor:: +operator()(const int N, const T* X, T* Y, CUDAContext* context) const { + SoftsignCUDAKernel + <<cuda_stream()>>>(N, X, Y); + return true; +} + +template <> +template +bool SoftsignGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + SoftsignGradientCUDAKernel + <<cuda_stream()>>>(size, dY, X, dX); + return true; +} REGISTER_CUDA_OPERATOR( Softsign, - UnaryElementwiseOp, CUDAContext, SoftsignCUDAFunctor>); + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + SoftsignFunctor>); REGISTER_CUDA_OPERATOR( SoftsignGradient, - BinaryElementwiseOp, CUDAContext, WithoutBroadcast>); + BinaryElementwiseOp< + TensorTypes, + CUDAContext, + SoftsignGradientFunctor>); + } // namespace caffe2 diff --git a/caffe2/operators/softsign_op.h b/caffe2/operators/softsign_op.h new file mode 100644 index 
00000000000000..e2ce15605e46dd --- /dev/null +++ b/caffe2/operators/softsign_op.h @@ -0,0 +1,31 @@ +#ifndef CAFFE2_OPERATORS_SOFTSIGN_OP_H_ +#define CAFFE2_OPERATORS_SOFTSIGN_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct SoftsignFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const; +}; + +template +struct SoftsignGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& X_dims, + const T* dY, + const T* X, + T* dX, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_SOFTSIGN_OP_H_ diff --git a/caffe2/operators/sqr_op.cc b/caffe2/operators/sqr_op.cc new file mode 100644 index 00000000000000..732cf737538a2a --- /dev/null +++ b/caffe2/operators/sqr_op.cc @@ -0,0 +1,47 @@ +#include "caffe2/operators/sqr_op.h" + +#include +#include + +namespace caffe2 { + +REGISTER_CPU_OPERATOR( + Sqr, + UnaryElementwiseOp, CPUContext, SqrFunctor>); + +OPERATOR_SCHEMA(Sqr) + .NumInputs(1) + .NumOutputs(1) + .AllowInplace({{0, 0}}) + .IdenticalTypeAndShape() + .SetDoc("Square (x^2) the elements of the input") + .Input(0, "input", "Input tensor") + .Output(0, "output", "Squared elements of the input"); + +namespace { + +class GetSqrGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + std::vector GetGradientDefs() override { + Argument scale_arg; + scale_arg.set_name("scale"); + scale_arg.set_f(2.0); + return std::vector{CreateOperatorDef( + "Scale", + "", + std::vector{GO(0)}, + std::vector{GO(0)}, + std::vector{scale_arg}), + CreateOperatorDef( + "Mul", + "", + std::vector{GO(0), I(0)}, + std::vector{GI(0)})}; + } +}; + +} // namespace + +REGISTER_GRADIENT(Sqr, GetSqrGradient); + +} // namespace caffe2 diff --git a/caffe2/operators/sqr_op.h b/caffe2/operators/sqr_op.h new file mode 100644 index 00000000000000..f5af6542b85f47 --- /dev/null +++ b/caffe2/operators/sqr_op.h @@ -0,0 +1,20 @@ +#ifndef CAFFE2_OPERATORS_SQR_OP_H_ +#define CAFFE2_OPERATORS_SQR_OP_H_ + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct SqrFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Sqr(N, X, Y, context); + return true; + } +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_SQR_OP_H_ diff --git a/caffe2/operators/sqr_op_gpu.cc b/caffe2/operators/sqr_op_gpu.cc new file mode 100644 index 00000000000000..6f7ed8e5aa63f9 --- /dev/null +++ b/caffe2/operators/sqr_op_gpu.cc @@ -0,0 +1,14 @@ +#include "caffe2/operators/sqr_op.h" + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +REGISTER_CUDA_OPERATOR( + Sqr, + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + SqrFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/sqrt_op.cc b/caffe2/operators/sqrt_op.cc index e3a7880720e86e..b9ceaaa33aa73c 100644 --- a/caffe2/operators/sqrt_op.cc +++ b/caffe2/operators/sqrt_op.cc @@ -1,19 +1,17 @@ -#include -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/sqrt_op.h" -namespace caffe2 { +#include +#include -struct SqrtCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* /*device_context*/) { - EigenVectorArrayMap(y, n) = ConstEigenVectorArrayMap(x, n).sqrt(); - } -}; +namespace caffe2 { REGISTER_CPU_OPERATOR( Sqrt, - UnaryElementwiseOp, CPUContext, SqrtCPUFunctor>); + UnaryElementwiseOp< + 
TensorTypes, + CPUContext, + SqrtFunctor>); + // Input: X, output: Y OPERATOR_SCHEMA(Sqrt) .NumInputs(1) @@ -26,24 +24,30 @@ Computes the element-wise sqrt of the input. .Input(0, "X", "ND input tensor") .Output(0, "Y", "ND input tensor"); +namespace { + class GetSqrtGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { Argument scale_arg; scale_arg.set_name("scale"); scale_arg.set_f(0.5); - return vector{CreateOperatorDef( - "Scale", - "", - std::vector{GO(0)}, - std::vector{GI(0)}, - std::vector{scale_arg}), - CreateOperatorDef( - "Div", - "", - std::vector{GI(0), O(0)}, - std::vector{GI(0)})}; + return std::vector{CreateOperatorDef( + "Scale", + "", + std::vector{GO(0)}, + std::vector{GI(0)}, + std::vector{scale_arg}), + CreateOperatorDef( + "Div", + "", + std::vector{GI(0), O(0)}, + std::vector{GI(0)})}; } }; + +} // namespace + REGISTER_GRADIENT(Sqrt, GetSqrtGradient); + } // namespace caffe2 diff --git a/caffe2/operators/sqrt_op.cu b/caffe2/operators/sqrt_op.cu deleted file mode 100644 index af45e7c00b1e7c..00000000000000 --- a/caffe2/operators/sqrt_op.cu +++ /dev/null @@ -1,30 +0,0 @@ -#include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/operators/math_ops.h" - -namespace caffe2 { - -template -__global__ void SqrtKernel(const int N, const T* x, T* y) { - CUDA_1D_KERNEL_LOOP(i, N) { - y[i] = sqrt(x[i]); - } -} - -struct SqrtCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - SqrtKernel - <<cuda_stream()>>>(n, x, y); - return; - } -}; - -REGISTER_CUDA_OPERATOR( - Sqrt, - UnaryElementwiseOp, CUDAContext, SqrtCUDAFunctor>); -} // namespace caffe2 diff --git a/caffe2/operators/sqrt_op.h b/caffe2/operators/sqrt_op.h new file mode 100644 index 00000000000000..4200068b67801e --- /dev/null +++ b/caffe2/operators/sqrt_op.h @@ -0,0 +1,20 @@ +#ifndef CAFFE2_OPERATORS_SQRT_OP_H_ +#define CAFFE2_OPERATORS_SQRT_OP_H_ + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct SqrtFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Sqrt(N, X, Y, context); + return true; + } +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_SQRT_OP_H_ diff --git a/caffe2/operators/sqrt_op_gpu.cc b/caffe2/operators/sqrt_op_gpu.cc new file mode 100644 index 00000000000000..7a4ae2d30de0dd --- /dev/null +++ b/caffe2/operators/sqrt_op_gpu.cc @@ -0,0 +1,14 @@ +#include "caffe2/operators/sqrt_op.h" + +#include "caffe2/core/context_gpu.h" + +namespace caffe2 { + +REGISTER_CUDA_OPERATOR( + Sqrt, + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + SqrtFunctor>); + +} // namespace caffe2 diff --git a/caffe2/operators/string_ops.h b/caffe2/operators/string_ops.h index 17011fe7376323..b6f670f86938b4 100644 --- a/caffe2/operators/string_ops.h +++ b/caffe2/operators/string_ops.h @@ -2,7 +2,7 @@ #define CAFFE2_OPERATORS_STRING_OPS_H_ #include "caffe2/core/operator.h" -#include "caffe2/operators/elementwise_op.h" +#include "caffe2/operators/elementwise_ops.h" namespace caffe2 { @@ -19,11 +19,13 @@ struct ForEach { explicit ForEach(OperatorBase& op) : functor(op) {} template - void operator()(int n, const In* in, Out* out, Context* /*c*/) { + bool operator()(int n, const In* in, Out* out, Context* /*c*/) { for (int i = 0; i < n; ++i) { out[i] = functor(in[i]); } + return true; } + Functor 
functor; }; diff --git a/caffe2/operators/swish_op.cc b/caffe2/operators/swish_op.cc index 185553e481fb4c..cd8dfa7ea5d3f6 100644 --- a/caffe2/operators/swish_op.cc +++ b/caffe2/operators/swish_op.cc @@ -1,17 +1,21 @@ -#include "swish_op.h" +#include "caffe2/operators/swish_op.h" + +#include +#include + #include "caffe2/core/types.h" -#include "caffe2/operators/elementwise_op.h" #include "caffe2/utils/math.h" namespace caffe2 { -struct SwishCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* /*device_context*/) { - ConstEigenVectorArrayMap xM(x, n); - EigenVectorArrayMap(y, n) = xM / (1. + (-xM).exp()); - } -}; + +template <> +template +bool SwishFunctor:: +operator()(const int N, const T* X, T* Y, CPUContext* /* context */) const { + ConstEigenVectorArrayMap X_arr(X, N); + EigenVectorArrayMap(Y, N) = X_arr / (T(1) + (-X_arr).exp()); + return true; +} template <> template @@ -35,16 +39,16 @@ bool SwishGradientOp::DoRunWithType() { ConstEigenVectorArrayMap dYvec(dYdata, DYin.size()); // dx = dy * (y + sigmoid(x)*(1-y)) - dXvec = dYvec * (Yvec + (1. / (1. + (-Xvec).exp())) * (1. - Yvec)); + dXvec = dYvec * (Yvec + (T(1) / (T(1) + (-Xvec).exp())) * (T(1) - Yvec)); return true; } REGISTER_CPU_OPERATOR( Swish, UnaryElementwiseOp< - TensorTypes, + TensorTypes, CPUContext, - SwishCPUFunctor>); + SwishFunctor>); REGISTER_CPU_OPERATOR(SwishGradient, SwishGradientOp); // Input: X, output: Y @@ -69,5 +73,21 @@ SwishGradient takes X, Y and dY and uses this to update dX according to the chain rule and derivatives of the swish function. )DOC"); +namespace { + +class GetSwishGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + std::vector GetGradientDefs() override { + return SingleGradientDef( + "SwishGradient", + "", + std::vector{I(0), O(0), GO(0)}, + std::vector{GI(0)}); + } +}; + +} // namespace + REGISTER_GRADIENT(Swish, GetSwishGradient); + } // namespace caffe2 diff --git a/caffe2/operators/swish_op.cu b/caffe2/operators/swish_op.cu index 399f1b61a8abdf..d42ad3d86b2f17 100644 --- a/caffe2/operators/swish_op.cu +++ b/caffe2/operators/swish_op.cu @@ -1,38 +1,52 @@ -#include +#include "caffe2/operators/swish_op.h" #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/operators/swish_op.h" namespace caffe2 { +namespace { + template -__global__ void SwishKernel(const int N, const T* x, T* y) { +__global__ void SwishCUDAKernel(const int N, const T* X, T* Y) { CUDA_1D_KERNEL_LOOP(i, N) { - y[i] = x[i] / (1. + exp(-x[i])); +#if __CUDA_ARCH__ >= 350 + Y[i] = __ldg(X + i) / (T(1) + exp(-__ldg(X + i))); +#else + Y[i] = X[i] / (T(1) + exp(-X[i])); +#endif } } template -__global__ void -SwishGradientKernel(const int N, const T* x, const T* y, const T* dy, T* dx) { +__global__ void SwishGradientCUDAKernel( + const int N, + const T* X, + const T* Y, + const T* dY, + T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { - dx[i] = dy[i] * (y[i] + (1. - y[i]) / (1. 
+ exp(-x[i]))); +#if __CUDA_ARCH__ >= 350 + dX[i] = __ldg(dY + i) * + (__ldg(Y + i) + (T(1) - __ldg(Y + i)) / (T(1) + exp(-__ldg(X + i)))); +#else + dX[i] = dY[i] * (Y[i] + (T(1) - Y[i]) / (T(1) + exp(-X[i]))); +#endif } } -struct SwishCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - SwishKernel - <<cuda_stream()>>>(n, x, y); - return; - } -}; +} // namespace + +template <> +template +bool SwishFunctor:: +operator()(const int N, const T* X, T* Y, CUDAContext* context) const { + SwishCUDAKernel + <<cuda_stream()>>>(N, X, Y); + return true; +} template <> template @@ -50,7 +64,7 @@ bool SwishGradientOp::DoRunWithType() { const T* y = Yin.template data(); const T* dy = DYin.template data(); T* dx = DXout->template mutable_data(); - SwishGradientKernel + SwishGradientCUDAKernel <<, CUDAContext, - SwishCUDAFunctor>); + SwishFunctor>); REGISTER_CUDA_OPERATOR(SwishGradient, SwishGradientOp); + } // namespace caffe2 diff --git a/caffe2/operators/swish_op.h b/caffe2/operators/swish_op.h index 01e9683f97a2e5..c582691dab900b 100644 --- a/caffe2/operators/swish_op.h +++ b/caffe2/operators/swish_op.h @@ -1,9 +1,17 @@ -#pragma once +#ifndef CAFFE2_OPERATORS_SWISH_OP_H_ +#define CAFFE2_OPERATORS_SWISH_OP_H_ -#include "caffe2/core/operator.h" +#include "caffe2/operators/elementwise_ops.h" #include "caffe2/utils/math.h" namespace caffe2 { + +template +struct SwishFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const; +}; + template class SwishGradientOp final : public Operator { public: @@ -22,15 +30,6 @@ class SwishGradientOp final : public Operator { OUTPUT_TAGS(DX); }; -class GetSwishGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SwishGradient", - "", - vector{I(0), O(0), GO(0)}, - vector{GI(0)}); - } -}; - } // namespace caffe2 + +#endif // CAFFE2_OPERATORS_SWISH_OP_H_ diff --git a/caffe2/operators/tan_op.cc b/caffe2/operators/tan_op.cc index 1101bed7d36b31..9cc26beddfe488 100644 --- a/caffe2/operators/tan_op.cc +++ b/caffe2/operators/tan_op.cc @@ -1,35 +1,36 @@ -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include "caffe2/operators/tan_op.h" + +#include +#include namespace caffe2 { -struct TanCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* device_context) { - math::Tan(n, x, y, device_context); - } -}; - -struct TanGradientCPUFunctor { - template - inline void - Run(const int n, const T* x, const T* dy, T* dx, CPUContext* /* unused */) { - ConstEigenVectorArrayMap dyM(dy, n); - ConstEigenVectorArrayMap xM(x, n); - EigenVectorMap(dx, n) = dyM / square(cos(xM)); - } -}; +template <> +template +bool TanGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap X_arr(X, size); + EigenVectorMap(dX, size) = dY_arr / X_arr.cos().square(); + return true; +} REGISTER_CPU_OPERATOR( Tan, - UnaryElementwiseOp, CPUContext, TanCPUFunctor>); + UnaryElementwiseOp, CPUContext, TanFunctor>); REGISTER_CPU_OPERATOR( TanGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + TanGradientFunctor>); OPERATOR_SCHEMA(Tan) .NumInputs(1) @@ -46,16 
+47,21 @@ Calculates the tangent of the given input tensor, element-wise. OPERATOR_SCHEMA(TanGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape(); +namespace { + class GetTanGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( "TanGradient", "", - std::vector{I(0), GO(0)}, - std::vector{GI(0)}); + std::vector{GO(0), I(0)}, + std::vector{GI(0)}); } }; +} // namespace + REGISTER_GRADIENT(Tan, GetTanGradient); + } // namespace caffe2 diff --git a/caffe2/operators/tan_op.cu b/caffe2/operators/tan_op.cu index fe74918011a573..e66fecda9020d0 100644 --- a/caffe2/operators/tan_op.cu +++ b/caffe2/operators/tan_op.cu @@ -1,61 +1,59 @@ -#include +#include "caffe2/operators/tan_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { template -__global__ void TanKernel(const int N, const T* X, T* Y) { - CUDA_1D_KERNEL_LOOP(i, N) { - Y[i] = tan(X[i]); - } +inline __host__ __device__ T Square(const T& x) { + return x * x; } template -__global__ void TanGradientKernel(const int N, const T* X, const T* dY, T* dX) { +__global__ void +TanGradientCUDAKernel(const int N, const T* dY, const T* X, T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { - dX[i] = dY[i] / pow(cos(X[i]), 2); +#if __CUDA_ARCH__ >= 350 + dX[i] = __ldg(dY + i) / Square(cos(__ldg(X + i))); +#else + dX[i] = dY[i] / Square(cos(X[i])); +#endif } } -struct TanCUDAFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CUDAContext* device_context) { - TanKernel - <<cuda_stream()>>>(n, x, y); - return; - } -}; - -struct TanGradientCUDAFunctor { - template - inline void Run( - const int n, - const T* x, - const T* dy, - T* dx, - CUDAContext* device_context) { - TanGradientKernel - <<cuda_stream()>>>(n, x, dy, dx); - return; - } -}; +template <> +template +bool TanGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* X_dims */, + const T* dY, + const T* X, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + TanGradientCUDAKernel + <<cuda_stream()>>>(size, dY, X, dX); + return true; +} REGISTER_CUDA_OPERATOR( Tan, - UnaryElementwiseOp, CUDAContext, TanCUDAFunctor>); + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + TanFunctor>); REGISTER_CUDA_OPERATOR( TanGradient, BinaryElementwiseOp< TensorTypes, CUDAContext, - WithoutBroadcast>); + TanGradientFunctor>); + } // namespace caffe2 diff --git a/caffe2/operators/tan_op.h b/caffe2/operators/tan_op.h new file mode 100644 index 00000000000000..579a20d12f11cb --- /dev/null +++ b/caffe2/operators/tan_op.h @@ -0,0 +1,34 @@ +#ifndef CAFFE2_OPERATORS_TAN_OP_H_ +#define CAFFE2_OPERATORS_TAN_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct TanFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const { + math::Tan(N, X, Y, context); + return true; + } +}; + +template +struct TanGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& X_dims, + const T* dY, + const T* X, + T* dX, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_TAN_OP_H_ diff --git a/caffe2/operators/tanh_op.cc b/caffe2/operators/tanh_op.cc index a19ec8e92ebd8d..c1b5f9261876f4 100644 --- 
a/caffe2/operators/tanh_op.cc +++ b/caffe2/operators/tanh_op.cc @@ -1,52 +1,63 @@ -#include +#include "caffe2/operators/tanh_op.h" -#include "caffe2/operators/elementwise_op.h" -#include "caffe2/utils/math.h" +#include +#include +#include namespace caffe2 { -struct TanhCPUFunctor { - template - inline void - operator()(const int n, const T* x, T* y, CPUContext* /*device_context*/) { +template <> +template <> +bool TanhFunctor::operator()( + const int N, + const float* X, + float* Y, + CPUContext* /* context */) const { #ifdef CAFFE2_USE_ACCELERATE - vvtanhf(y, x, &n); + vvtanhf(Y, X, &N); #else - ConstEigenVectorArrayMap x_arr(x, n); - EigenVectorMap(y, n) = 1 - 2 * ((x_arr * 2).exp() + 1).inverse(); + ConstEigenVectorArrayMap X_arr(X, N); + EigenVectorMap(Y, N) = 1 - 2 * ((X_arr * 2).exp() + 1).inverse(); #endif - } -}; - -struct TanhGradientCPUFunctor { - template - inline void Run( - const int n, - const T* y, - const T* dy, - T* dx, - CPUContext* /*device_context*/) { - ConstEigenVectorArrayMap dy_arr(dy, n); - ConstEigenVectorArrayMap y_arr(y, n); - EigenVectorMap(dx, n) = dy_arr * (1 - y_arr * y_arr); - } -}; + return true; +} + +template <> +template +bool TanhGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* Y_dims */, + const T* dY, + const T* Y, + T* dX, + CPUContext* /* context */) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + ConstEigenVectorArrayMap dY_arr(dY, size); + ConstEigenVectorArrayMap Y_arr(Y, size); + EigenVectorMap(dX, size) = dY_arr * (1 - Y_arr * Y_arr); + return true; +} REGISTER_CPU_OPERATOR( - Tanh, UnaryElementwiseOp, CPUContext, TanhCPUFunctor>); + Tanh, + UnaryElementwiseOp< + TensorTypes, + CPUContext, + TanhFunctor>); REGISTER_CPU_OPERATOR( TanhGradient, BinaryElementwiseOp< TensorTypes, CPUContext, - WithoutBroadcast>); + TanhGradientFunctor>); OPERATOR_SCHEMA(Tanh) - .NumInputs(1) - .NumOutputs(1) - .AllowInplace({{0, 0}}) - .IdenticalTypeAndShape() - .SetDoc(R"DOC( + .NumInputs(1) + .NumOutputs(1) + .AllowInplace({{0, 0}}) + .IdenticalTypeAndShape() + .SetDoc(R"DOC( Calculates the hyperbolic tangent of the given input tensor element-wise. This operation can be done in an in-place fashion too, by providing the same input and output blobs. 
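The refactor above replaces the old per-device TanhCPUFunctor / TanhGradientCUDAFunctor structs with a single TanhFunctor<Context> / TanhGradientFunctor<Context> pair whose gradient is computed from the output Y alone (dX = dY * (1 - Y^2)). The standalone sketch below mirrors that interface outside of Caffe2 — FakeCPUContext and the *Sketch names are placeholders invented for illustration, not Caffe2 APIs — and checks the Y-based gradient against a central finite difference:

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Minimal stand-in for the context argument; the real CPUContext and
// elementwise_ops.h types are not reproduced here.
struct FakeCPUContext {};

template <class Context>
struct TanhFunctorSketch {
  template <typename T>
  bool operator()(const int N, const T* X, T* Y, Context* /* context */) const {
    for (int i = 0; i < N; ++i) {
      Y[i] = std::tanh(X[i]);
    }
    return true;
  }
};

template <class Context>
struct TanhGradientFunctorSketch {
  // Mirrors the Forward(dY_dims, Y_dims, dY, Y, dX, context) shape of the
  // refactored gradient functors: the gradient is computed from Y alone.
  template <typename T>
  bool Forward(
      const std::vector<int>& dY_dims,
      const std::vector<int>& /* Y_dims */,
      const T* dY,
      const T* Y,
      T* dX,
      Context* /* context */) const {
    int size = 1;
    for (const int d : dY_dims) {
      size *= d;
    }
    for (int i = 0; i < size; ++i) {
      dX[i] = dY[i] * (T(1) - Y[i] * Y[i]);
    }
    return true;
  }
};

int main() {
  FakeCPUContext ctx;
  TanhFunctorSketch<FakeCPUContext> tanh_fn;
  TanhGradientFunctorSketch<FakeCPUContext> tanh_grad_fn;

  const std::vector<int> dims = {2, 3};
  const std::vector<float> X = {-2.0f, -0.5f, 0.0f, 0.25f, 1.0f, 3.0f};
  std::vector<float> Y(X.size()), dX(X.size());
  const std::vector<float> dY(X.size(), 1.0f);  // upstream gradient of ones

  tanh_fn(static_cast<int>(X.size()), X.data(), Y.data(), &ctx);
  tanh_grad_fn.Forward(dims, dims, dY.data(), Y.data(), dX.data(), &ctx);

  // Check dX = 1 - tanh(x)^2 against a central finite difference.
  const float eps = 1e-3f;
  for (std::size_t i = 0; i < X.size(); ++i) {
    const float fd =
        (std::tanh(X[i] + eps) - std::tanh(X[i] - eps)) / (2.0f * eps);
    assert(std::abs(dX[i] - fd) < 1e-3f);
  }
  return 0;
}

Using the output-only form of the gradient is what lets Sigmoid and Tanh keep their gradient operators as plain BinaryElementwiseOp registrations taking {GO(0), O(0)}, on both CPU and CUDA.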
@@ -99,20 +110,31 @@ print("X:\n", workspace.FetchBlob("X")) )DOC") - .Input(0, "input", "1-D input tensor") - .Output(0, "output", "The hyperbolic tangent values of the input tensor, computed element-wise") - .InheritOnnxSchema("Tanh"); + .Input(0, "input", "1-D input tensor") + .Output( + 0, + "output", + "The hyperbolic tangent values of the input tensor, computed " + "element-wise") + .InheritOnnxSchema("Tanh"); + +OPERATOR_SCHEMA(TanhGradient).NumInputs(2).NumOutputs(1).AllowInplace({{0, 0}}); -OPERATOR_SCHEMA(TanhGradient).NumInputs(2).NumOutputs(1).AllowInplace({{1, 0}}); +namespace { class GetTanhGradient : public GradientMakerBase { using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { + std::vector GetGradientDefs() override { return SingleGradientDef( - "TanhGradient", "", - std::vector{O(0), GO(0)}, - std::vector{GI(0)}); + "TanhGradient", + "", + std::vector{GO(0), O(0)}, + std::vector{GI(0)}); } }; + +} // namespace + REGISTER_GRADIENT(Tanh, GetTanhGradient); -} // namespace caffe2 + +} // namespace caffe2 diff --git a/caffe2/operators/tanh_op.cu b/caffe2/operators/tanh_op.cu index 0feeb6c4b1527a..a15e6758d11bc3 100644 --- a/caffe2/operators/tanh_op.cu +++ b/caffe2/operators/tanh_op.cu @@ -1,52 +1,81 @@ -#include +#include "caffe2/operators/tanh_op.h" + +#include +#include #include "caffe2/core/context_gpu.h" -#include "caffe2/operators/elementwise_op.h" namespace caffe2 { +namespace { + template -__global__ void TanhKernel(const int N, const T* X, T* Y) { +__global__ void TanhCUDAKernel(const int N, const T* X, T* Y) { CUDA_1D_KERNEL_LOOP(i, N) { +#if __CUDA_ARCH__ >= 350 + Y[i] = tanh(__ldg(X + i)); +#else Y[i] = tanh(X[i]); +#endif } } template -__global__ void TanhGradientKernel(const int N, const T* Y, const T* dY, - T* dX) { +__global__ void +TanhGradientCUDAKernel(const int N, const T* dY, const T* Y, T* dX) { CUDA_1D_KERNEL_LOOP(i, N) { - dX[i] = dY[i]*(1 - Y[i]*Y[i]); +#if __CUDA_ARCH__ >= 350 + dX[i] = __ldg(dY + i) * (T(1) - __ldg(Y + i) * __ldg(Y + i)); +#else + dX[i] = dY[i] * (T(1) - Y[i] * Y[i]); +#endif } } -struct TanhCUDAFunctor { - template - inline void operator()(const int n, const T* x, - T* y, CUDAContext* device_context) { - TanhKernel<<cuda_stream()>>>(n, x, y); - return; - } - inline bool InplaceAllowed() { - return true; - } -}; - -struct TanhGradientCUDAFunctor { - template - inline void Run(const int n, const T* y, const T* dy, - T* dx, CUDAContext* device_context) { - TanhGradientKernel<<cuda_stream()>>>(n, y, dy, dx); - return; - } -}; +} // namespace + +template <> +template +bool TanhFunctor:: +operator()(const int N, const T* X, T* Y, CUDAContext* context) const { + TanhCUDAKernel + <<cuda_stream()>>>(N, X, Y); + return true; +} + +template <> +template +bool TanhGradientFunctor::Forward( + const std::vector& dY_dims, + const std::vector& /* Y_dims */, + const T* dY, + const T* Y, + T* dX, + CUDAContext* context) const { + const int size = std::accumulate( + dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies()); + TanhGradientCUDAKernel + <<cuda_stream()>>>(size, dY, Y, dX); + return true; +} REGISTER_CUDA_OPERATOR( - Tanh, UnaryElementwiseOp, CUDAContext, TanhCUDAFunctor>); + Tanh, + UnaryElementwiseOp< + TensorTypes, + CUDAContext, + TanhFunctor>); REGISTER_CUDA_OPERATOR( - TanhGradient, BinaryElementwiseOp< - TensorTypes, CUDAContext, - WithoutBroadcast>); -} // namespace caffe2 + TanhGradient, + BinaryElementwiseOp< + TensorTypes, + CUDAContext, + TanhGradientFunctor>); + +} // namespace caffe2 diff --git 
a/caffe2/operators/tanh_op.h b/caffe2/operators/tanh_op.h new file mode 100644 index 00000000000000..b8fdcff4e990df --- /dev/null +++ b/caffe2/operators/tanh_op.h @@ -0,0 +1,31 @@ +#ifndef CAFFE2_OPERATORS_TANH_OP_H_ +#define CAFFE2_OPERATORS_TANH_OP_H_ + +#include + +#include "caffe2/operators/elementwise_ops.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +struct TanhFunctor { + template + bool operator()(const int N, const T* X, T* Y, Context* context) const; +}; + +template +struct TanhGradientFunctor { + template + bool Forward( + const std::vector& dY_dims, + const std::vector& Y_dims, + const T* dY, + const T* Y, + T* dX, + Context* context) const; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_TANH_OP_H_ diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index 38b5e22ae627ce..e0f8dd160b08a8 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -6,6 +6,8 @@ using namespace nom; +namespace { + std::map getArgumentsFromOperator(caffe2::OperatorDef op) { std::map argMap; @@ -83,6 +85,8 @@ std::vector getDilations(std::map argMap) { return dilations; } +} // namespace + namespace caffe2 { std::unique_ptr @@ -483,4 +487,4 @@ caffe2::NetDef convertToCaffe2Proto(repr::NNModule &m, const caffe2::NetDef& old return predictNet; } -} // namespace caffe2 +} // namespace caffe2 diff --git a/caffe2/python/attention.py b/caffe2/python/attention.py index da9066bc78067b..73be94feaf2b38 100644 --- a/caffe2/python/attention.py +++ b/caffe2/python/attention.py @@ -346,9 +346,9 @@ def apply_soft_coverage_attention( ) # [encoder_length, batch_size, encoder_output_dim] - decoder_hidden_encoder_outputs_sum = model.net.Add( + decoder_hidden_encoder_outputs_sum_tmp = model.net.Add( [weighted_encoder_outputs, weighted_decoder_hidden_state], - s(scope, 'decoder_hidden_encoder_outputs_sum'), + s(scope, 'decoder_hidden_encoder_outputs_sum_tmp'), broadcast=1, ) # [batch_size, encoder_length] @@ -374,8 +374,8 @@ def apply_soft_coverage_attention( # [encoder_length, batch_size, encoder_output_dim] decoder_hidden_encoder_outputs_sum = model.net.Add( - [decoder_hidden_encoder_outputs_sum, scaled_coverage_weights], - decoder_hidden_encoder_outputs_sum, + [decoder_hidden_encoder_outputs_sum_tmp, scaled_coverage_weights], + s(scope, 'decoder_hidden_encoder_outputs_sum'), ) # [batch_size, encoder_length, 1] diff --git a/caffe2/python/core_gradients_test.py b/caffe2/python/core_gradients_test.py index c5158e65e70098..bf25806f20dde2 100644 --- a/caffe2/python/core_gradients_test.py +++ b/caffe2/python/core_gradients_test.py @@ -710,7 +710,7 @@ def testAddOpInMiddle(self): input_to_grad = net.AddGradientOperators({"x4": "x4_grad"}) sum_op = net.Proto().op[-2] self.assertEqual(sum_op.input[0], "x2_grad") - self.assertEqual(sum_op.input[1], "x4_grad") + self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") self.assertEqual(sum_op.output[0], "x2_grad") self.assertEqual(input_to_grad["x1"], "x1_grad") @@ -762,7 +762,7 @@ def testSubOpInMiddle(self): print(str(net.Proto())) sum_op = net.Proto().op[-2] self.assertEqual(sum_op.input[0], "x2_grad") - self.assertEqual(sum_op.input[1], "x4_grad") + self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") self.assertEqual(sum_op.output[0], "x2_grad") self.assertEqual(input_to_grad["x1"], "x1_grad") @@ -788,12 +788,12 @@ def testAddOpAtLeaf(self): net.DotProduct(["x4", "x5"], "x6") input_to_grad = net.AddGradientOperators({"x6": "x6_grad"}) sum_op = net.Proto().op[-1] - self.assertEqual(sum_op.input[0], "x5_grad") - 
self.assertEqual(sum_op.input[1], "x4_grad") + self.assertEqual(sum_op.input[0], "x2_grad") + self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") self.assertEqual(sum_op.output[0], "x2_grad") - self.assertEqual(input_to_grad["x1"], "x4_grad") + self.assertEqual(input_to_grad["x1"], "x1_grad") self.assertEqual(input_to_grad["x2"], "x2_grad") - self.assertEqual(input_to_grad["x3"], "x5_grad") + self.assertEqual(input_to_grad["x3"], "x3_grad") def testSubOpAtLeaf(self): # x1 @@ -818,9 +818,9 @@ def testSubOpAtLeaf(self): input_to_grad = net.AddGradientOperators({"x6": "x6_grad"}) sum_op = net.Proto().op[-1] self.assertEqual(sum_op.input[0], "x2_grad") - self.assertEqual(sum_op.input[1], "x5_grad") + self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") self.assertEqual(sum_op.output[0], "x2_grad") - self.assertEqual(input_to_grad["x1"], "x4_grad") + self.assertEqual(input_to_grad["x1"], "x1_grad") self.assertEqual(input_to_grad["x2"], "x2_grad") self.assertEqual(input_to_grad["x3"], "x3_grad") @@ -846,12 +846,12 @@ def testMultiLayerAddOps(self): net.Add(["x4", "x5"], "x6") input_to_grad = net.AddGradientOperators({"x6": "x6_grad"}) sum_op = net.Proto().op[-1] - self.assertEqual(sum_op.input[0], "x6_grad") - self.assertEqual(sum_op.input[1], "x6_grad") + self.assertEqual(sum_op.input[0], "x2_grad") + self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") self.assertEqual(sum_op.output[0], "x2_grad") - self.assertEqual(input_to_grad["x1"], "x6_grad") + self.assertEqual(input_to_grad["x1"], "x1_grad") self.assertEqual(input_to_grad["x2"], "x2_grad") - self.assertEqual(input_to_grad["x3"], "x6_grad") + self.assertEqual(input_to_grad["x3"], "x3_grad") def testMultiLayerSubOps(self): # x1 @@ -876,9 +876,9 @@ def testMultiLayerSubOps(self): input_to_grad = net.AddGradientOperators({"x6": "x6_grad"}) sum_op = net.Proto().op[-1] self.assertEqual(sum_op.input[0], "x2_grad") - self.assertEqual(sum_op.input[1], "x5_grad") + self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") self.assertEqual(sum_op.output[0], "x2_grad") - self.assertEqual(input_to_grad["x1"], "x6_grad") + self.assertEqual(input_to_grad["x1"], "x1_grad") self.assertEqual(input_to_grad["x2"], "x2_grad") self.assertEqual(input_to_grad["x3"], "x3_grad") diff --git a/caffe2/python/gradient_check_test.py b/caffe2/python/gradient_check_test.py index ae6eccb8a335f0..6561efbea3ad67 100644 --- a/caffe2/python/gradient_check_test.py +++ b/caffe2/python/gradient_check_test.py @@ -507,6 +507,7 @@ def testIf(self): class TestWhile(test_util.TestCase): + @unittest.skip("Skip flaky test.") def testWhile(self): with NetBuilder(_use_control_ops=True) as nb: ops.Copy(ops.Const(0), "i") diff --git a/caffe2/python/ideep/squeeze_op_test.py b/caffe2/python/ideep/squeeze_op_test.py new file mode 100644 index 00000000000000..2a2f85dae07985 --- /dev/null +++ b/caffe2/python/ideep/squeeze_op_test.py @@ -0,0 +1,35 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import unittest +import hypothesis.strategies as st +from hypothesis import given +import numpy as np +from caffe2.python import core, workspace +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.ideep_test_util as mu + + +@unittest.skipIf(not workspace.C.use_ideep, "No IDEEP support.") +class SqueezeTest(hu.HypothesisTestCase): + @given( + squeeze_dims=st.lists(st.integers(0, 3), min_size=1, max_size=3), + inplace=st.booleans(), + **mu.gcs + ) + def 
test_squeeze(self, squeeze_dims, inplace, gc, dc): + shape = [ + 1 if dim in squeeze_dims else np.random.randint(1, 5) + for dim in range(4) + ] + X = np.random.rand(*shape).astype(np.float32) + op = core.CreateOperator( + "Squeeze", "X", "X" if inplace else "Y", dims=squeeze_dims + ) + self.assertDeviceChecks(dc, op, [X], [0]) + + +if __name__ == "__main__": + unittest.main() diff --git a/caffe2/python/mkl/rewrite_graph.py b/caffe2/python/mkl/rewrite_graph.py index 45baa59aed6bcc..ccf800e440f3cc 100644 --- a/caffe2/python/mkl/rewrite_graph.py +++ b/caffe2/python/mkl/rewrite_graph.py @@ -8,9 +8,10 @@ from caffe2.python import core -def rewrite_init_net_simple(net): +def rewrite_init_net_simple(net, ideep=True): + device = caffe2_pb2.IDEEP if ideep else caffe2_pb2.MKLDNN for op in net.op: - op.device_option.device_type = caffe2_pb2.MKLDNN + op.device_option.device_type = device def last_producer(ops, blob): for (i, op) in reversed(list(enumerate(ops))): @@ -19,7 +20,7 @@ def last_producer(ops, blob): raise ValueError("Failed to find last producer of blob, %s", blob) -def rewrite_run_net_simple(net): +def rewrite_run_net_simple(net, ideep=True): # Simple rewrite for now - assume entire graph can be executed # with MKL, so just insert copy ops for external_input[0] and # external_output[0] @@ -32,12 +33,14 @@ def mkl_tmp(name): "Input blob: {} is not consumed by first op: {}".format( input_blob, net.op[0])) # Modify input/outputs to point to copied MKL blobs. + from_cpu = "CopyCPUToIDEEP" if ideep else "CopyCPUToMKL" + to_cpu = "CopyIDEEPToCPU" if ideep else "CopyMKLToCPU" copy_input_op = core.CreateOperator( - "CopyCPUToMKL", input_blob, mkl_tmp(input_blob)) + from_cpu, input_blob, mkl_tmp(input_blob)) net.op[0].input[0] = mkl_tmp(input_blob) copy_output_ops = [ - core.CreateOperator("CopyMKLToCPU", mkl_tmp(output_blob), output_blob) + core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob) for output_blob in net.external_output] for output_blob in net.external_output: @@ -54,15 +57,16 @@ def mkl_tmp(name): ops = [copy_input_op] + net.op[:] + copy_output_ops del net.op[:] net.op.extend(ops) + device = caffe2_pb2.IDEEP if ideep else caffe2_pb2.MKLDNN for op in net.op: op.device_option.MergeFrom( - core.DeviceOption(device_type=caffe2_pb2.MKLDNN)) + core.DeviceOption(device_type=device)) op.engine = "" -def rewrite_model_helper_simple(model): +def rewrite_model_helper_simple(model, ideep=True): model = copy.deepcopy(model) # All parameter initialization should run on MKL - rewrite_init_net_simple(model.param_init_net.Proto()) - rewrite_run_net_simple(model.net.Proto()) + rewrite_init_net_simple(model.param_init_net.Proto(), ideep) + rewrite_run_net_simple(model.net.Proto(), ideep) return model diff --git a/caffe2/python/mkl/rewrite_graph_test.py b/caffe2/python/mkl/rewrite_graph_test.py index 65bf7fd53d44eb..87b08d5283e54c 100644 --- a/caffe2/python/mkl/rewrite_graph_test.py +++ b/caffe2/python/mkl/rewrite_graph_test.py @@ -180,15 +180,16 @@ def complex_resnet(): return model, [(1, 1, 224, 224)] -@unittest.skipIf(not workspace.C.has_mkldnn, - "Skipping as we do not have mkldnn.") +@unittest.skipIf(not workspace.C.has_mkldnn or not workspace.C.use_ideep, + "Skipping as we do not have MKLDNN and IDEEP.") class MKLRewriteTest(hu.HypothesisTestCase): @given(gen=st.sampled_from([simple_relu, simple_fc, - simple_mlp, simple_cnn])) - def test_mkl_simple_rewrite(self, gen): + simple_mlp, simple_cnn]), + ideep=st.booleans()) + def test_mkl_simple_rewrite(self, gen, ideep): cpu_model, (shape,) = 
gen() cpu_model = deterministic_io(cpu_model) - mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model) + mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model, ideep) X = np.random.randn(*shape).astype(np.float32) def run(model): @@ -200,10 +201,11 @@ def run(model): np.testing.assert_allclose(run(cpu_model), run(mkl_model), atol=1e-4, rtol=1e-4) - def test_mkl_resnet_rewrite(self): + @given(ideep=st.booleans()) + def test_mkl_resnet_rewrite(self, ideep): cpu_model, (shape,) = complex_resnet() cpu_model = deterministic_io(cpu_model) - mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model) + mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model, ideep) np.random.seed(1701) X = np.random.randn(*shape).astype(np.float32) @@ -215,10 +217,11 @@ def run(model): np.testing.assert_allclose(run(cpu_model), run(mkl_model), atol=1e-4, rtol=1e-4) - def test_mkl_multi_output_rewrite(self): + @given(ideep=st.booleans()) + def test_mkl_multi_output_rewrite(self, ideep): cpu_model, shapes = double_matmul() cpu_model = deterministic_io(cpu_model) - mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model) + mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model, ideep) np.random.seed(1701) Xs = [np.random.randn(*shape).astype(np.float32) for shape in shapes] @@ -236,10 +239,11 @@ def run(model): np.testing.assert_allclose(run(cpu_model), run(mkl_model), atol=1e-4, rtol=1e-4) - def test_mkl_alexnet_rewrite(self): + @given(ideep=st.booleans()) + def test_mkl_alexnet_rewrite(self, ideep): cpu_model, (shape,) = alexnet() cpu_model = deterministic_io(cpu_model) - mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model) + mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model, ideep) np.random.seed(1701) X = np.random.randn(*shape).astype(np.float32) diff --git a/caffe2/python/operator_test/elementwise_op_broadcast_test.py b/caffe2/python/operator_test/elementwise_op_broadcast_test.py index f7b1dca934dd5c..ff6bddeb2bb6d0 100644 --- a/caffe2/python/operator_test/elementwise_op_broadcast_test.py +++ b/caffe2/python/operator_test/elementwise_op_broadcast_test.py @@ -324,155 +324,3 @@ def test_semantic_broadcast(self, gc, dc): out = workspace.FetchBlob("out") np.testing.assert_array_almost_equal(out, X + Y) self.assertDeviceChecks(dc, op, [X, Y], [0]) - - @given(**hu.gcs) - def test_sum_reduce(self, gc, dc): - # Set broadcast and no axis, i.e. broadcasting last dimensions. - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(4, 5).astype(np.float32) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - res = np.sum(X, axis=0) - res = np.sum(res, axis=0) - np.testing.assert_array_almost_equal(out, res) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - - # Set broadcast and no axis, i.e. broadcasting last dimensions. 
- X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(2, 3).astype(np.float32) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=0) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - res = np.sum(X, axis=3) - res = np.sum(res, axis=2) - np.testing.assert_array_almost_equal(out, res, decimal=3) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - - # broadcasting intermediate dimensions - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(3, 4).astype(np.float32) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=1) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - res = np.sum(X, axis=0) - res = np.sum(res, axis=2) - np.testing.assert_array_almost_equal(out, res) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - - # broadcasting intermediate dimensions - X = np.random.rand(2, 3, 4, 500).astype(np.float64) - Y = np.random.rand(1).astype(np.float64) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - res = np.array(np.sum(X)) - np.testing.assert_array_almost_equal(out, res, decimal=0) - - # broadcasting with single elem dimensions at both ends - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(1, 3, 4, 1).astype(np.float32) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - res = np.sum(X, axis=0) - res = np.sum(res, axis=2).reshape(Y.shape) - np.testing.assert_array_almost_equal(out, res) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - - # fp64 is not supported with the CUDA op - dc_cpu_only = [d for d in dc if d.device_type != caffe2_pb2.CUDA] - self.assertDeviceChecks(dc_cpu_only, op, [X, Y], [0]) - - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") - @given(**hu.gcs_gpu_only) - def test_sum_reduce_fp16(self, gc, dc): - # Set broadcast and no axis, i.e. broadcasting last dimensions. - X = np.random.rand(2, 3, 4, 5).astype(np.float16) - Y = np.random.rand(4, 5).astype(np.float16) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1, device_option=gc) - - def ref_op(X, Y): - res = np.sum(X, axis=0) - res = np.sum(res, axis=0) - return [res] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, Y], - reference=ref_op, - threshold=1e-3) - - # Set broadcast and no axis, i.e. broadcasting last dimensions. 
- X = np.random.rand(2, 3, 4, 5).astype(np.float16) - Y = np.random.rand(2, 3).astype(np.float16) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=0) - - def ref_op(X, Y): - res = np.sum(X, axis=3) - res = np.sum(res, axis=2) - return [res] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, Y], - reference=ref_op, - threshold=1e-3) - - # broadcasting intermediate dimensions - X = np.random.rand(2, 3, 4, 5).astype(np.float16) - Y = np.random.rand(3, 4).astype(np.float16) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=1) - - def ref_op(X, Y): - res = np.sum(X, axis=0) - res = np.sum(res, axis=2) - return [res] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, Y], - reference=ref_op, - threshold=1e-3) - - # broadcasting with single elem dimensions at both ends - X = np.random.rand(2, 3, 4, 5).astype(np.float16) - Y = np.random.rand(1, 3, 4, 1).astype(np.float16) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1) - - def ref_op(X, Y): - res = np.sum(X, axis=0) - res = np.sum(res, axis=2) - return [res.reshape(Y.shape)] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, Y], - reference=ref_op, - threshold=1e-3) - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/elementwise_ops_test.py b/caffe2/python/operator_test/elementwise_ops_test.py index 0232b4b7699a11..30d4a8da4f3532 100644 --- a/caffe2/python/operator_test/elementwise_ops_test.py +++ b/caffe2/python/operator_test/elementwise_ops_test.py @@ -12,32 +12,6 @@ class TestElementwiseOps(hu.HypothesisTestCase): - @given(n=st.integers(0, 10), m=st.integers(4, 6), - d=st.integers(2, 3), seed=st.integers(0, 1000), **hu.gcs) - def test_div(self, n, m, d, gc, dc, seed): - np.random.seed(seed) - X = np.random.rand(n, m, d).astype(np.float32) - Y = np.random.rand(n, m, d).astype(np.float32) + 5.0 - - def div_op(X, Y): - return [np.divide(X, Y)] - - op = core.CreateOperator( - "Div", - ["X", "Y"], - ["Z"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, Y], - reference=div_op, - ) - - self.assertGradientChecks( - gc, op, [X, Y], 0, [0], stepsize=1e-4, threshold=1e-2) - @given(n=st.integers(0, 6), m=st.integers(4, 6), seed=st.integers(0, 1000), **hu.gcs) def test_log(self, n, m, gc, dc, seed): @@ -307,3 +281,137 @@ def eq(X, Y): result_2 = net_2.EQ(["X", "Y"], 1) (shapes, types) = workspace.InferShapesAndTypes([net]) self.assertTrue(str(result_2) not in shapes) + + def _run_single_test(self, op, ref, A, B, test_grad, gc, dc): + inputs = [A, B] + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=inputs, + reference=ref, + ) + self.assertDeviceChecks(dc, op, inputs, [0]) + if test_grad: + for i in range(len(inputs)): + self.assertGradientChecks(gc, op, inputs, i, [0]) + + inputs = [B, A] + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=inputs, + reference=ref, + ) + self.assertDeviceChecks(dc, op, inputs, [0]) + if test_grad: + for i in range(len(inputs)): + self.assertGradientChecks(gc, op, inputs, i, [0]) + + def _test_binary_op( + self, op_name, np_ref, n, m, k, t, bias, test_grad, gc, dc): + op = core.CreateOperator( + op_name, + ["A", "B"], + ["C"], + ) + + def ref(A, B): + return [np_ref(A, B)] + + A = np.random.rand(n, m, k, t).astype(np.float32) + bias + B = np.random.rand(n, m, k, t).astype(np.float32) + bias + self._run_single_test(op, ref, A, B, test_grad, gc, dc) + + A = 
np.random.rand(1).astype(np.float32) + bias + B = np.random.rand(n, m, k, t).astype(np.float32) + bias + self._run_single_test(op, ref, A, B, test_grad, gc, dc) + + A = np.random.rand(k, t).astype(np.float32) + bias + B = np.random.rand(n, m, k, t).astype(np.float32) + bias + self._run_single_test(op, ref, A, B, test_grad, gc, dc) + + A = np.random.rand(n, m, 1, 1).astype(np.float32) + bias + B = np.random.rand(n, m, k, t).astype(np.float32) + bias + self._run_single_test(op, ref, A, B, test_grad, gc, dc) + + A = np.random.rand(m, 1, t).astype(np.float32) + bias + B = np.random.rand(n, m, k, t).astype(np.float32) + bias + self._run_single_test(op, ref, A, B, test_grad, gc, dc) + + A = np.random.rand(1, m, 1, t).astype(np.float32) + bias + B = np.random.rand(n, 1, k, 1).astype(np.float32) + bias + self._run_single_test(op, ref, A, B, test_grad, gc, dc) + + @given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), + t=st.integers(1, 5), **hu.gcs) + def test_add(self, n, m, k, t, gc, dc): + self._test_binary_op("Add", np.add, n, m, k, t, -0.5, True, gc, dc) + + @given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), + t=st.integers(1, 5), **hu.gcs) + def test_sub(self, n, m, k, t, gc, dc): + self._test_binary_op("Sub", np.subtract, n, m, + k, t, -0.5, True, gc, dc) + + @given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), + t=st.integers(1, 5), **hu.gcs) + def test_mul(self, n, m, k, t, gc, dc): + self._test_binary_op("Mul", np.multiply, n, m, + k, t, -0.5, True, gc, dc) + + @given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), + t=st.integers(1, 5), **hu.gcs) + def test_div(self, n, m, k, t, gc, dc): + self._test_binary_op("Div", np.divide, n, m, k, t, 1.0, True, gc, dc) + + def _test_bitwise_binary_op(self, op_name, np_ref, n, m, k, t, gc, dc): + op = core.CreateOperator( + op_name, + ["A", "B"], + ["C"], + ) + + def ref(A, B): + return [np_ref(A, B)] + + A = np.random.randint(128, size=(n, m, k, t)) + B = np.random.randint(128, size=(n, m, k, t)) + self._run_single_test(op, ref, A, B, False, gc, dc) + + A = np.random.randint(128, size=1) + B = np.random.randint(128, size=(n, m, k, t)) + self._run_single_test(op, ref, A, B, False, gc, dc) + + A = np.random.randint(128, size=(k, t)) + B = np.random.randint(128, size=(n, m, k, t)) + self._run_single_test(op, ref, A, B, False, gc, dc) + + A = np.random.randint(128, size=(n, m, 1, 1)) + B = np.random.randint(128, size=(n, m, k, t)) + self._run_single_test(op, ref, A, B, False, gc, dc) + + A = np.random.randint(128, size=(m, 1, t)) + B = np.random.randint(128, size=(n, m, k, t)) + self._run_single_test(op, ref, A, B, False, gc, dc) + + A = np.random.randint(128, size=(1, m, 1, t)) + B = np.random.randint(128, size=(n, 1, k, 1)) + self._run_single_test(op, ref, A, B, False, gc, dc) + + @given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), + t=st.integers(1, 5), **hu.gcs) + def test_bitwise_and(self, n, m, k, t, gc, dc): + self._test_bitwise_binary_op( + "BitwiseAnd", np.bitwise_and, n, m, k, t, gc, dc) + + @given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), + t=st.integers(1, 5), **hu.gcs) + def test_bitwise_or(self, n, m, k, t, gc, dc): + self._test_bitwise_binary_op( + "BitwiseOr", np.bitwise_or, n, m, k, t, gc, dc) + + @given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), + t=st.integers(1, 5), **hu.gcs) + def test_bitwise_xor(self, n, m, k, t, gc, dc): + self._test_bitwise_binary_op( + "BitwiseXor", np.bitwise_xor, n, m, k, t, gc, dc) 
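The new _test_binary_op cases above exercise full NumPy-style broadcasting (e.g. shapes like (1, m, 1, t) against (n, 1, k, 1)), which is what the (A_ndim, A_dims, B_ndim, B_dims, ...) overloads declared in math.h further below are expected to provide in place of the old Rowwise/Colwise-only helpers. The following self-contained sketch — BroadcastDims and BroadcastIndex are hypothetical helper names, not the Caffe2 implementation — shows the indexing rule those tests assume:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Right-align the two shapes, pad the shorter one with 1s, and take the max
// along each axis; an axis may broadcast only if one side is 1 or both match.
std::vector<int> BroadcastDims(std::vector<int> A, std::vector<int> B) {
  const std::size_t ndim = std::max(A.size(), B.size());
  A.insert(A.begin(), ndim - A.size(), 1);
  B.insert(B.begin(), ndim - B.size(), 1);
  std::vector<int> C(ndim);
  for (std::size_t i = 0; i < ndim; ++i) {
    assert(A[i] == B[i] || A[i] == 1 || B[i] == 1);
    C[i] = std::max(A[i], B[i]);
  }
  return C;
}

// Map a linear index in the broadcast output shape back to a linear index in
// an input whose (right-aligned, 1-padded) shape is `dims`.
int BroadcastIndex(int out_index, const std::vector<int>& out_dims,
                   const std::vector<int>& dims) {
  int in_index = 0;
  int in_stride = 1;
  for (int i = static_cast<int>(out_dims.size()) - 1; i >= 0; --i) {
    const int coord = out_index % out_dims[i];
    out_index /= out_dims[i];
    if (dims[i] != 1) {
      in_index += coord * in_stride;
    }
    in_stride *= dims[i];
  }
  return in_index;
}

int main() {
  // Same kind of shape pair as the tests above: (1, 3) + (2, 1) -> (2, 3).
  const std::vector<int> a_dims = {1, 3};
  const std::vector<int> b_dims = {2, 1};
  const std::vector<float> A = {1.0f, 2.0f, 3.0f};
  const std::vector<float> B = {10.0f, 20.0f};

  const std::vector<int> c_dims = BroadcastDims(a_dims, b_dims);
  std::vector<float> C(c_dims[0] * c_dims[1]);
  for (int i = 0; i < static_cast<int>(C.size()); ++i) {
    C[i] = A[BroadcastIndex(i, c_dims, a_dims)] +
           B[BroadcastIndex(i, c_dims, b_dims)];
  }

  const std::vector<float> expected = {11.0f, 12.0f, 13.0f, 21.0f, 22.0f, 23.0f};
  assert(C == expected);
  return 0;
}

The gradient checks in _run_single_test then only have to verify that each input's gradient is reduced back over the axes that were broadcast, which is why both argument orders (A, B) and (B, A) are tested.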
diff --git a/caffe2/python/operator_test/reshape_ops_test.py b/caffe2/python/operator_test/reshape_ops_test.py index bf3a6564ef4594..175256e920d296 100644 --- a/caffe2/python/operator_test/reshape_ops_test.py +++ b/caffe2/python/operator_test/reshape_ops_test.py @@ -48,6 +48,8 @@ def test_zero_dim(self): expected_shape=(4, 2, 1)) _test_reshape(old_shape=(4, 2, 1), new_shape=(0, 2, 1), expected_shape=(4, 2, 1), arg_shape=False) + _test_reshape(old_shape=(0, 0), new_shape=(0, 0, 0), + expected_shape=(0, 0, 0), arg_shape=False) def test_zero_dim_and_missing_dim(self): _test_reshape(old_shape=(4, 2, 1), new_shape=(0, -1, 0), diff --git a/caffe2/python/operator_test/shape_inference_test.py b/caffe2/python/operator_test/shape_inference_test.py index f31f003c669648..59eb899c97c8df 100644 --- a/caffe2/python/operator_test/shape_inference_test.py +++ b/caffe2/python/operator_test/shape_inference_test.py @@ -494,6 +494,12 @@ def testInt8Conversion(self): workspace.FeedBlob('x', np.random.rand(100, 150).astype(np.float32)) self.InferTensorRunAndCompare(model) + def testShapeOp(self): + model = model_helper.ModelHelper(name="shape_op_test") + model.Shape('x', 'y') + workspace.FeedBlob('x', np.random.rand(100, 150).astype(np.float32)) + self.InferTensorRunAndCompare(model) + def InferTensorRunAndCompare(self, model, expected_uninferred_blobs=None): ''' Runs shape inference, and then the model to check diff --git a/caffe2/utils/math.h b/caffe2/utils/math.h index bd0b2d6e44db9d..81c8228cf25f7b 100644 --- a/caffe2/utils/math.h +++ b/caffe2/utils/math.h @@ -15,6 +15,7 @@ extern "C" { #include "caffe2/core/common.h" #include "caffe2/core/types.h" +#include "caffe2/utils/math_utils.h" #include "Eigen/Core" #include "Eigen/Dense" @@ -31,26 +32,26 @@ class DefaultEngine {}; // Common Eigen types that we will often use template using EigenMatrixMap = - Eigen::Map >; + Eigen::Map>; template using EigenArrayMap = - Eigen::Map >; + Eigen::Map>; template -using EigenVectorMap = Eigen::Map >; +using EigenVectorMap = Eigen::Map>; template -using EigenVectorArrayMap = Eigen::Map >; +using EigenVectorArrayMap = Eigen::Map>; template using ConstEigenMatrixMap = - Eigen::Map >; + Eigen::Map>; template using ConstEigenArrayMap = - Eigen::Map >; + Eigen::Map>; template using ConstEigenVectorMap = - Eigen::Map >; + Eigen::Map>; template using ConstEigenVectorArrayMap = - Eigen::Map >; + Eigen::Map>; namespace math { @@ -81,69 +82,107 @@ void InvSqrt(const int N, const T* x, T* y, Context* context); template void Sqr(const int N, const T* x, T* y, Context* context); +template +void Neg(const int N, const T* x, T* y, Context* context); + +template +void Sign(const int N, const T* x, T* y, Context* context); + template void Not(const int N, const T* x, T* y, Context* context); template void Powx(const int N, const T* a, const T b, T* y, Context* context); -#define CAFFE2_DECLARE_BINARY_OP_BINARY_RESULT(name) \ +#define CAFFE2_DECLARE_COMPARE_OP(Comp) \ template \ - void name(const int N, const T* a, const T* b, bool* y, Context* context); \ + void Comp(const int N, const T* A, const T* B, bool* C, Context* context); \ + \ + template \ + void Rowwise##Comp( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + bool* C, \ + Context* context); \ + \ + template \ + void Colwise##Comp( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + bool* C, \ + Context* context); \ + \ template \ - void name##ToRow( \ - const int M, \ - const int N, \ - const T* a, \ - const T* b, \ - bool* y, \ + 
void Comp( \ + const int A_ndim, \ + const int* A_dims, \ + const int B_ndim, \ + const int* B_dims, \ + const T* A, \ + const T* B, \ + bool* C, \ Context* context); -CAFFE2_DECLARE_BINARY_OP_BINARY_RESULT(LT); -CAFFE2_DECLARE_BINARY_OP_BINARY_RESULT(LE); -CAFFE2_DECLARE_BINARY_OP_BINARY_RESULT(GT); -CAFFE2_DECLARE_BINARY_OP_BINARY_RESULT(GE); - -CAFFE2_DECLARE_BINARY_OP_BINARY_RESULT(And); -CAFFE2_DECLARE_BINARY_OP_BINARY_RESULT(Or); -CAFFE2_DECLARE_BINARY_OP_BINARY_RESULT(Xor); +CAFFE2_DECLARE_COMPARE_OP(EQ) +CAFFE2_DECLARE_COMPARE_OP(NE) +CAFFE2_DECLARE_COMPARE_OP(LT) +CAFFE2_DECLARE_COMPARE_OP(LE) +CAFFE2_DECLARE_COMPARE_OP(GT) +CAFFE2_DECLARE_COMPARE_OP(GE) -#undef CAFFE2_DECLARE_BINARY_OP_BINARY_RESULT +#undef CAFFE2_DECLARE_COMPARE_OP -#define CAFFE2_DECLARE_BINARY_OP(name) \ - template \ - void name(const int N, const T* a, const T* b, T* y, Context* context); \ +#define CAFFE2_DECLARE_BINARY_OP(Func) \ template \ - void name##ToRow( \ - const int M, \ - const int N, \ - const T* a, \ - const T* b, \ - T* y, \ + void Func(const int N, const T* A, const T* B, T* C, Context* context); \ + \ + template \ + void Rowwise##Func( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ Context* context); \ + \ + template \ + void Colwise##Func( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + Context* context); \ + \ template \ - void name##ToRow( \ - const int M, const int N, const T* x, T* y, Context* context); \ - template \ - void name##ToCol( \ - const int M, const int N, const T* x, T* y, Context* context); - -CAFFE2_DECLARE_BINARY_OP(Add); -CAFFE2_DECLARE_BINARY_OP(Sub); -CAFFE2_DECLARE_BINARY_OP(Mul); -CAFFE2_DECLARE_BINARY_OP(Div); - -#undef CAFFE2_DECLARE_BINARY_OP + void Func( \ + const int A_ndim, \ + const int* A_dims, \ + const int B_ndim, \ + const int* B_dims, \ + const T* A, \ + const T* B, \ + T* C, \ + Context* context); -namespace internal { +CAFFE2_DECLARE_BINARY_OP(Add) +CAFFE2_DECLARE_BINARY_OP(Sub) +CAFFE2_DECLARE_BINARY_OP(Mul) +CAFFE2_DECLARE_BINARY_OP(Div) -// Increase the index digits by one based on dims. -void IncreaseIndexInDims(const int n, const int* dims, int* index); +CAFFE2_DECLARE_BINARY_OP(And) +CAFFE2_DECLARE_BINARY_OP(Or) +CAFFE2_DECLARE_BINARY_OP(Xor) -// Get index value from dims and index digits. -int GetIndexFromDims(const int n, const int* dims, const int* index); +CAFFE2_DECLARE_BINARY_OP(BitwiseAnd) +CAFFE2_DECLARE_BINARY_OP(BitwiseOr) +CAFFE2_DECLARE_BINARY_OP(BitwiseXor) -} // namespace internal +#undef CAFFE2_DECLARE_BINARY_OP template void ReduceMin( @@ -152,6 +191,7 @@ void ReduceMin( T* y, Tensor* scratch_ptr, Context* context); + template void ReduceMax( const int N, @@ -237,14 +277,12 @@ void AddStripedBatch( // Compute the row-wise max of a N*D matrix X, and write it to a N // dimensional vector y. template -void RowwiseMax(const int N, const int D, const T* x, T* y, - Context* context); +void RowwiseMax(const int N, const int D, const T* x, T* y, Context* context); // Compute the column-wise max of a N*D matrix X, and write it to a D // dimensional vector y. template -void ColwiseMax(const int N, const int D, const T* x, T* y, - Context* context); +void ColwiseMax(const int N, const int D, const T* x, T* y, Context* context); // Elemwise maximum of vector x and vector y. z[i] = max(x[i], y[i]) template @@ -370,8 +408,12 @@ void Dot(const int N, const T* a, const T* b, T* y, Context* context); // Sum of vector x, and writes the result to a single value y. 
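Editor's note (illustrative, not part of the patch): the generalized compare and binary-op overloads declared above take two dimension lists and broadcast them NumPy-style. As a reference for that shape rule only, here is a hypothetical standalone helper (BroadcastShape is my name, not a symbol from the patch) that right-aligns the two shapes and requires each pair of dimensions to match or be 1; this is the same contract ComputeBroadcastBinaryOpDims enforces in caffe2/utils/math_utils.cc later in this patch.

#include <algorithm>
#include <cassert>
#include <vector>

// Right-align two shapes and compute the broadcast output shape.
std::vector<int> BroadcastShape(std::vector<int> a, std::vector<int> b) {
  const std::size_t ndim = std::max(a.size(), b.size());
  a.insert(a.begin(), ndim - a.size(), 1);
  b.insert(b.begin(), ndim - b.size(), 1);
  std::vector<int> c(ndim);
  for (std::size_t i = 0; i < ndim; ++i) {
    // Each aligned pair of dims must match, or one of them must be 1.
    assert(a[i] == b[i] || a[i] == 1 || b[i] == 1);
    c[i] = std::max(a[i], b[i]);
  }
  return c;
}

int main() {
  // A: 2 x 3 x 4, B: 3 x 1  ->  C: 2 x 3 x 4
  const std::vector<int> c = BroadcastShape({2, 3, 4}, {3, 1});
  assert((c == std::vector<int>{2, 3, 4}));
  return 0;
}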
template -void Sum(const int N, const T* x, T* y, Context* context, - Tensor* scratch_ptr = nullptr); +void Sum( + const int N, + const T* x, + T* y, + Context* context, + Tensor* scratch_ptr = nullptr); // Sum of squares of vector x, and writes the result to a single value y. template @@ -385,8 +427,13 @@ void SumSqr( // Select does index selection of the rows a N*D matrix x, and gives the N // dimensional vector y that contains the selected data. template -void Select(const int N, const int D, const T* x, const int* idx, T* y, - Context* context); +void Select( + const int N, + const int D, + const T* x, + const int* idx, + T* y, + Context* context); template void Scale(const int N, const float alpha, const T* x, T* y, Context* context); @@ -487,12 +534,12 @@ void Col2Im( // image. image_size is H * W template void BiasCHW( - const T* bias, - const T* bias_multiplier, - const int bias_channels, - const int image_size, - T* image, - Context* context); + const T* bias, + const T* bias_multiplier, + const int bias_channels, + const int image_size, + T* image, + Context* context); template void CopyMatrix( @@ -525,7 +572,7 @@ inline bool is_a_ge_zero_and_a_lt_b(int a, int b) { // is no overflow or underflow in the calculation. template constexpr T divUp(T a, T b) { - return (a + b - (T) 1) / b; + return (a + b - (T)1) / b; } // Rounds a up to the next highest multiple of b. User must be careful @@ -548,8 +595,8 @@ constexpr T integerNextHighestPowerOf2(T v) { return (integerIsPowerOf2(v) ? (T)2 * v : ((T)1 << (integerLog2(v) + 1))); } -} // namespace math -} // namespace caffe2 +} // namespace math +} // namespace caffe2 #include "caffe2/utils/math-detail.h" -#endif // CAFFE2_UTILS_MATH_H_ +#endif // CAFFE2_UTILS_MATH_H_ diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index 140471a457dc4d..2b45fad7a63523 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -22,18 +22,19 @@ #include #include #include +#include #include #include -#include "caffe2/utils/cpu_neon.h" #include "caffe2/core/context.h" +#include "caffe2/utils/cpu_neon.h" #include "Eigen/Core" #include "Eigen/Dense" #ifdef CAFFE2_USE_MKL #include -#endif // CAFFE2_USE_MKL +#endif // CAFFE2_USE_MKL #ifdef CAFFE2_USE_HPTT #include @@ -91,40 +92,40 @@ void Gemm( C_mat *= beta; } switch (TransA) { - case CblasNoTrans: { - switch (TransB) { - case CblasNoTrans: - C_mat.noalias() += alpha * ( - ConstEigenMatrixMap(B, N, K) * - ConstEigenMatrixMap(A, K, M)); - return; - case CblasTrans: - C_mat.noalias() += alpha * ( - ConstEigenMatrixMap(B, K, N).transpose() * - ConstEigenMatrixMap(A, K, M)); - return; - default: - LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB"; + case CblasNoTrans: { + switch (TransB) { + case CblasNoTrans: + C_mat.noalias() += alpha * + (ConstEigenMatrixMap(B, N, K) * + ConstEigenMatrixMap(A, K, M)); + return; + case CblasTrans: + C_mat.noalias() += alpha * + (ConstEigenMatrixMap(B, K, N).transpose() * + ConstEigenMatrixMap(A, K, M)); + return; + default: + LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB"; + } } - } - case CblasTrans: { - switch (TransB) { - case CblasNoTrans: - C_mat.noalias() += alpha * ( - ConstEigenMatrixMap(B, N, K) * - ConstEigenMatrixMap(A, M, K).transpose()); - return; - case CblasTrans: - C_mat.noalias() += alpha * ( - ConstEigenMatrixMap(B, K, N).transpose() * - ConstEigenMatrixMap(A, M, K).transpose()); - return; - default: - LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB"; + case CblasTrans: { + switch (TransB) { + case CblasNoTrans: + 
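Editor's note (illustrative, not part of the patch): the Eigen fallback for Gemm in math_cpu.cc maps row-major buffers through column-major Eigen maps, so C = A * B is evaluated as C^T = B^T * A^T; that is why the operands and their dimensions appear swapped in the code above. A small self-contained check of that identity, assuming only Eigen/Core:

#include <cassert>
#include "Eigen/Core"

int main() {
  const float A[6] = {1, 2, 3, 4, 5, 6}; // 2 x 3, row-major
  const float B[6] = {7, 8, 9, 10, 11, 12}; // 3 x 2, row-major
  float C[4] = {0, 0, 0, 0}; // 2 x 2, row-major

  using Mat = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>; // col-major
  // Viewing row-major data through a column-major Map yields the transpose,
  // so C^T = B^T * A^T produces row-major C without any copies.
  Eigen::Map<Mat>(C, 2, 2).noalias() =
      Eigen::Map<const Mat>(B, 2, 3) * Eigen::Map<const Mat>(A, 3, 2);

  assert(C[0] == 1 * 7 + 2 * 9 + 3 * 11); // C(0,0) = 58
  assert(C[1] == 1 * 8 + 2 * 10 + 3 * 12); // C(0,1) = 64
  assert(C[2] == 4 * 7 + 5 * 9 + 6 * 11); // C(1,0) = 139
  assert(C[3] == 4 * 8 + 5 * 10 + 6 * 12); // C(1,1) = 154
  return 0;
}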
C_mat.noalias() += alpha * + (ConstEigenMatrixMap(B, N, K) * + ConstEigenMatrixMap(A, M, K).transpose()); + return; + case CblasTrans: + C_mat.noalias() += alpha * + (ConstEigenMatrixMap(B, K, N).transpose() * + ConstEigenMatrixMap(A, M, K).transpose()); + return; + default: + LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB"; + } } - } - default: - LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransA"; + default: + LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransA"; } } @@ -157,14 +158,14 @@ void GemmEx( case CblasNoTrans: { switch (TransB) { case CblasNoTrans: - C_mat.noalias() += - alpha * (ConstStridedMap(B, N, K, OuterStride(ldb)) * - ConstStridedMap(A, K, M, OuterStride(lda))); + C_mat.noalias() += alpha * + (ConstStridedMap(B, N, K, OuterStride(ldb)) * + ConstStridedMap(A, K, M, OuterStride(lda))); return; case CblasTrans: - C_mat.noalias() += - alpha * (ConstStridedMap(B, K, N, OuterStride(ldb)).transpose() * - ConstStridedMap(A, K, M, OuterStride(lda))); + C_mat.noalias() += alpha * + (ConstStridedMap(B, K, N, OuterStride(ldb)).transpose() * + ConstStridedMap(A, K, M, OuterStride(lda))); return; default: LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB"; @@ -173,14 +174,14 @@ void GemmEx( case CblasTrans: { switch (TransB) { case CblasNoTrans: - C_mat.noalias() += - alpha * (ConstStridedMap(B, N, K, OuterStride(ldb)) * - ConstStridedMap(A, M, K, OuterStride(lda)).transpose()); + C_mat.noalias() += alpha * + (ConstStridedMap(B, N, K, OuterStride(ldb)) * + ConstStridedMap(A, M, K, OuterStride(lda)).transpose()); return; case CblasTrans: - C_mat.noalias() += - alpha * (ConstStridedMap(B, K, N, OuterStride(ldb)).transpose() * - ConstStridedMap(A, M, K, OuterStride(lda)).transpose()); + C_mat.noalias() += alpha * + (ConstStridedMap(B, K, N, OuterStride(ldb)).transpose() * + ConstStridedMap(A, M, K, OuterStride(lda)).transpose()); return; default: LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB"; @@ -213,15 +214,15 @@ void Gemv( } switch (TransA) { case CblasNoTrans: { - y_vec.noalias() += alpha * ( - ConstEigenMatrixMap(A, N, M).transpose() * - ConstEigenVectorMap(x, N)); + y_vec.noalias() += alpha * + (ConstEigenMatrixMap(A, N, M).transpose() * + ConstEigenVectorMap(x, N)); return; } case CblasTrans: { - y_vec.noalias() += alpha * ( - ConstEigenMatrixMap(A, N, M) * - ConstEigenVectorMap(x, M)); + y_vec.noalias() += alpha * + (ConstEigenMatrixMap(A, N, M) * + ConstEigenVectorMap(x, M)); return; } default: @@ -247,13 +248,12 @@ void Gemv( CAFFE2_SPECIALIZED_SCALE(float) #undef CAFFE2_SPECIALIZED_SCALE -#define CAFFE2_SPECIALIZED_DOT(T) \ -template<> \ -void Dot( \ - const int N, const T* a, const T* b, T* y, \ - CPUContext* context) { \ - *y = ConstEigenVectorMap(a, N).dot(ConstEigenVectorMap(b, N)); \ -} +#define CAFFE2_SPECIALIZED_DOT(T) \ + template <> \ + void Dot( \ + const int N, const T* a, const T* b, T* y, CPUContext* context) { \ + *y = ConstEigenVectorMap(a, N).dot(ConstEigenVectorMap(b, N)); \ + } CAFFE2_SPECIALIZED_DOT(float) #undef CAFFE2_SPECIALIZED_DOT @@ -271,17 +271,22 @@ CAFFE2_SPECIALIZED_DOT(float) CAFFE2_SPECIALIZED_AXPY(float) #undef CAFFE2_SPECIALIZED_AXPY -#define CAFFE2_SPECIALIZED_AXPBY(T) \ -template <> \ -void Axpby(const int N, const T alpha, const T* x, \ - const T beta, T* y, CPUContext* context) { \ - EigenVectorMap y_vec(y, N); \ - y_vec = y_vec * beta + ConstEigenVectorMap(x, N) * alpha; \ -} +#define CAFFE2_SPECIALIZED_AXPBY(T) \ + template <> \ + void Axpby( \ + const int N, \ + const T alpha, \ + const T* x, \ + const T beta, \ + T* 
y, \ + CPUContext* context) { \ + EigenVectorMap y_vec(y, N); \ + y_vec = y_vec * beta + ConstEigenVectorMap(x, N) * alpha; \ + } CAFFE2_SPECIALIZED_AXPBY(float) #undef CAFFE2_SPECIALIZED_AXPBY -#else // CAFFE2_USE_EIGEN_FOR_BLAS +#else // CAFFE2_USE_EIGEN_FOR_BLAS template <> void Gemm( @@ -299,8 +304,21 @@ void Gemm( TensorProto::DataType /*math_type*/) { int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, - beta, C, N); + cblas_sgemm( + CblasRowMajor, + TransA, + TransB, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + N); } template <> @@ -319,8 +337,21 @@ void GemmEx( float* C, const int ldc, CPUContext* /*context*/) { - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); + cblas_sgemm( + CblasRowMajor, + TransA, + TransB, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc); } template <> @@ -393,7 +424,7 @@ CAFFE2_SPECIALIZED_AXPY(float, s) CPUContext*) { \ cblas_##prefix##axpby(N, alpha, x, 1, beta, y, 1); \ } -#else // CAFFE2_USE_MKL +#else // CAFFE2_USE_MKL #define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \ template <> \ void Axpby( \ @@ -406,11 +437,11 @@ CAFFE2_SPECIALIZED_AXPY(float, s) cblas_##prefix##scal(N, beta, y, 1); \ cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \ } -#endif // CAFFE2_USE_MKL +#endif // CAFFE2_USE_MKL CAFFE2_SPECIALIZED_AXPBY(float, s) #undef CAFFE2_SPECIALIZED_AXPBY -#endif // CAFFE2_USE_EIGEN_FOR_BLAS +#endif // CAFFE2_USE_EIGEN_FOR_BLAS template <> void GemmBatched( @@ -549,28 +580,28 @@ DELEGATE_POWX_FUNCTION(float, vsPowx) DELEGATE_POWX_FUNCTION(double, vdPowx) #undef DELEGATE_POWX_FUNCTION -#define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Funcname, OriginalFunc) \ - template <> \ - void Funcname( \ - const int N, const T* a, const T* b, T* y, CPUContext*) { \ - OriginalFunc(N, a, b, y); \ +#define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Func, FuncImpl) \ + template <> \ + void Func( \ + const int N, const T* A, const T* B, T* C, CPUContext*) { \ + FuncImpl(N, A, B, C); \ } -DELEGATE_SIMPLE_BINARY_FUNCTION(float, Add, vsAdd) +DELEGATE_SIMPLE_BINARY_FUNCTION(float, Add, vsAdd) DELEGATE_SIMPLE_BINARY_FUNCTION(double, Add, vdAdd) -DELEGATE_SIMPLE_BINARY_FUNCTION(float, Sub, vsSub) +DELEGATE_SIMPLE_BINARY_FUNCTION(float, Sub, vsSub) DELEGATE_SIMPLE_BINARY_FUNCTION(double, Sub, vdSub) -DELEGATE_SIMPLE_BINARY_FUNCTION(float, Mul, vsMul) +DELEGATE_SIMPLE_BINARY_FUNCTION(float, Mul, vsMul) DELEGATE_SIMPLE_BINARY_FUNCTION(double, Mul, vdMul) -DELEGATE_SIMPLE_BINARY_FUNCTION(float, Div, vsDiv) +DELEGATE_SIMPLE_BINARY_FUNCTION(float, Div, vsDiv) DELEGATE_SIMPLE_BINARY_FUNCTION(double, Div, vdDiv) #undef DELEGATE_SIMPLE_BINARY_FUNCTION -#else // CAFFE2_USE_MKL +#else // CAFFE2_USE_MKL #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr) \ template <> \ void Funcname(const int N, const T* x, T* y, CPUContext*) { \ - EigenVectorMap(y, N) = ConstEigenVectorMap(x, N).array().expr(); \ + EigenVectorMap(y, N) = ConstEigenVectorArrayMap(x, N).expr(); \ } DELEGATE_SIMPLE_UNARY_FUNCTION(float, Exp, exp) DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, log) @@ -586,12 +617,12 @@ DELEGATE_SIMPLE_UNARY_FUNCTION(float, InvSqrt, rsqrt) DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, square) #undef DELEGATE_SIMPLE_UNARY_FUNCTION -#define DELEGATE_SINCOS_FUNCTION(T) \ - template <> \ - void SinCos( \ - const int N, const T* x, T* ys, T* yc, CPUContext*) { \ - EigenVectorMap(ys, N) = ConstEigenVectorMap(x, 
N).array().sin(); \ - EigenVectorMap(yc, N) = ConstEigenVectorMap(x, N).array().cos(); \ +#define DELEGATE_SINCOS_FUNCTION(T) \ + template <> \ + void SinCos( \ + const int N, const T* x, T* ys, T* yc, CPUContext*) { \ + EigenVectorMap(ys, N) = ConstEigenVectorArrayMap(x, N).sin(); \ + EigenVectorMap(yc, N) = ConstEigenVectorArrayMap(x, N).cos(); \ } DELEGATE_SINCOS_FUNCTION(float) DELEGATE_SINCOS_FUNCTION(double) @@ -600,36 +631,56 @@ DELEGATE_SINCOS_FUNCTION(double) #define DELEGATE_POWX_FUNCTION(T) \ template <> \ void Powx(const int N, const T* a, T b, T* y, CPUContext*) { \ - EigenVectorMap(y, N) = ConstEigenVectorMap(a, N).array().pow(b); \ + EigenVectorMap(y, N) = ConstEigenVectorArrayMap(a, N).pow(b); \ } DELEGATE_POWX_FUNCTION(float) #undef DELEGATE_POWX_FUNCTION -#endif // CAFFE2_USE_MKL - +#endif // CAFFE2_USE_MKL -#define EIGEN_SIMPLE_BINARY_FUNCTION(T, Funcname, expr) \ -template <> \ -void Funcname( \ - const int N, const T* a, const T* b, T* y, \ - CPUContext*) { \ - EigenVectorMap(y, N) = \ - ConstEigenVectorMap(a, N).array() expr \ - ConstEigenVectorMap(b, N).array(); \ -} +#define DELEGATE_NEG_FUNCTION(T) \ + template <> \ + void Neg(const int N, const T* x, T* y, CPUContext*) { \ + EigenVectorMap(y, N) = -ConstEigenVectorMap(x, N); \ + } +DELEGATE_NEG_FUNCTION(float) +DELEGATE_NEG_FUNCTION(double) +DELEGATE_NEG_FUNCTION(std::int32_t) +DELEGATE_NEG_FUNCTION(std::int64_t) +#undef DELEGATE_NEG_FUNCTION + +#define DELEGATE_SIGN_FUNCTION(T) \ + template <> \ + void Sign(const int N, const T* x, T* y, CPUContext*) { \ + EigenVectorMap(y, N) = ConstEigenVectorArrayMap(x, N).sign(); \ + } +DELEGATE_SIGN_FUNCTION(float) +DELEGATE_SIGN_FUNCTION(double) +DELEGATE_SIGN_FUNCTION(std::int32_t) +DELEGATE_SIGN_FUNCTION(std::int64_t) +#undef DELEGATE_SIGN_FUNCTION + +#define EIGEN_SIMPLE_BINARY_FUNCTION(T, Func, expr) \ + template <> \ + void Func( \ + const int N, const T* A, const T* B, T* C, CPUContext*) { \ + EigenVectorMap(C, N) = ConstEigenVectorArrayMap(A, N) \ + expr ConstEigenVectorArrayMap(B, N); \ + } #ifdef CAFFE2_USE_MKL -#define DEFINE_SIMPLE_BINARY_FUNCTION(Funcname, expr) \ -EIGEN_SIMPLE_BINARY_FUNCTION(int32_t, Funcname, expr) \ -EIGEN_SIMPLE_BINARY_FUNCTION(int64_t, Funcname, expr) +#define DEFINE_SIMPLE_BINARY_FUNCTION(Func, expr) \ + EIGEN_SIMPLE_BINARY_FUNCTION(std::int32_t, Func, expr) \ + EIGEN_SIMPLE_BINARY_FUNCTION(std::int64_t, Func, expr) #else -#define DEFINE_SIMPLE_BINARY_FUNCTION(Funcname, expr) \ -EIGEN_SIMPLE_BINARY_FUNCTION(float, Funcname, expr) \ -EIGEN_SIMPLE_BINARY_FUNCTION(int32_t, Funcname, expr) \ -EIGEN_SIMPLE_BINARY_FUNCTION(int64_t, Funcname, expr) +#define DEFINE_SIMPLE_BINARY_FUNCTION(Func, expr) \ + EIGEN_SIMPLE_BINARY_FUNCTION(float, Func, expr) \ + EIGEN_SIMPLE_BINARY_FUNCTION(double, Func, expr) \ + EIGEN_SIMPLE_BINARY_FUNCTION(std::int32_t, Func, expr) \ + EIGEN_SIMPLE_BINARY_FUNCTION(std::int64_t, Func, expr) #endif @@ -638,9 +689,8 @@ DEFINE_SIMPLE_BINARY_FUNCTION(Sub, -) DEFINE_SIMPLE_BINARY_FUNCTION(Mul, *) DEFINE_SIMPLE_BINARY_FUNCTION(Div, /) +#undef DEFINE_SIMPLE_BINARY_FUNCTION #undef EIGEN_SIMPLE_BINARY_FUNCTION -#undef DEFINE_FLOAT_BINARY_FUNCTION - //////////////////////////////////////////////////////////////////////////////// // Common math functions being used in Caffe that do not have a BLAS or MKL @@ -677,31 +727,6 @@ CAFFE2_SPECIALIZED_REDUCEMAX(int64_t) #undef CAFFE2_SPECIALIZED_REDUCEMAX -namespace internal { - -void IncreaseIndexInDims(const int n, const int* dims, int* index) { - for (int i = n - 1; i >= 0; --i) { - 
++index[i]; - if (index[i] >= dims[i]) { - index[i] -= dims[i]; - } else { - break; - } - } -} - -int GetIndexFromDims(const int n, const int* dims, const int* index) { - int sum = 0; - for (int i = 0; i < n; ++i) { - if (dims[i] > 1) { - sum = sum * dims[i] + index[i]; - } - } - return sum; -} - -} // namespace internal - namespace { template @@ -728,9 +753,9 @@ void ReduceTensor( std::vector index(num_dims, 0); for (int X_index = 0; X_index < X_size; ++X_index) { const int Y_index = - internal::GetIndexFromDims(num_dims, Y_dims.data(), index.data()); + utils::GetIndexFromDims(num_dims, Y_dims.data(), index.data()); Y[Y_index] = reducer(Y[Y_index], X[X_index]); - internal::IncreaseIndexInDims(num_dims, dims, index.data()); + utils::IncreaseIndexInDims(num_dims, dims, index.data()); } } @@ -868,9 +893,9 @@ void BroadcastImpl( std::vector index(Y_ndim, 0); for (int Y_index = 0; Y_index < Y_size; ++Y_index) { const int X_index = - internal::GetIndexFromDims(Y_ndim, X_dims_ex.data(), index.data()); + utils::GetIndexFromDims(Y_ndim, X_dims_ex.data(), index.data()); Y[Y_index] = X[X_index]; - internal::IncreaseIndexInDims(Y_ndim, Y_dims, index.data()); + utils::IncreaseIndexInDims(Y_ndim, Y_dims, index.data()); } } @@ -920,10 +945,10 @@ void MomentsImpl( std::vector index(num_dims, 0); for (int X_index = 0; X_index < X_size; ++X_index) { const int Y_index = - internal::GetIndexFromDims(num_dims, Y_dims.data(), index.data()); + utils::GetIndexFromDims(num_dims, Y_dims.data(), index.data()); mean[Y_index] += X[X_index]; variance[Y_index] += X[X_index] * X[X_index]; - internal::IncreaseIndexInDims(num_dims, dims, index.data()); + utils::IncreaseIndexInDims(num_dims, dims, index.data()); } for (int Y_index = 0; Y_index < Y_size; ++Y_index) { mean[Y_index] /= static_cast(scale); @@ -992,42 +1017,169 @@ CAFFE2_SPECIALIZED_ELEMWISEMAX(float) CAFFE2_SPECIALIZED_MAXIMUM(float) #undef CAFFE2_SPECIALIZED_MAXIMUM -// AddToRow and AddToCol adds the corresponding row/col vector b to the matrix a -// of shape M x N. The actual implementation uses eigen which is column major, -// so notice the row/column swap in the actual implementation. -#define DELEGATE_BROADCAST_BINARY_FUNCTION(T, Funcname, expr) \ - template <> \ - void Funcname##ToRow( \ - const int M, const int N, const T* a, const T* b, T* y, CPUContext*) { \ - EigenArrayMap(y, N, M) = ConstEigenArrayMap(a, N, M).colwise() \ - expr ConstEigenVectorArrayMap(b, N); \ - } \ - /* inplace versions */ \ - template <> \ - void Funcname##ToRow( \ - const int M, const int N, const T* x, T* y, CPUContext*) { \ - EigenArrayMap(y, N, M).colwise() expr## = \ - ConstEigenVectorArrayMap(x, N); \ - } \ - template <> \ - void Funcname##ToCol( \ - const int M, const int N, const T* x, T* y, CPUContext*) { \ - EigenArrayMap(y, N, M).rowwise() expr## = \ - ConstEigenVectorArrayMap(x, M).transpose(); \ +// The actual implementation uses eigen which is column major, so notice the +// row/column swap in the actual implementation. 
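Editor's note (illustrative, not part of the patch): ReduceTensor, BroadcastImpl and MomentsImpl now call utils::IncreaseIndexInDims and utils::GetIndexFromDims, which this patch moves from the internal namespace into caffe2/utils/math_utils.cc. To make the traversal concrete, here is a standalone copy of the two helpers plus a tiny reduction walk-through; the helper bodies are taken verbatim from the patch, only main() is illustrative.

#include <cassert>

// Advance a row-major multi-index over `dims`.
void IncreaseIndexInDims(const int n, const int* dims, int* index) {
  for (int i = n - 1; i >= 0; --i) {
    ++index[i];
    if (index[i] >= dims[i]) {
      index[i] -= dims[i];
    } else {
      break;
    }
  }
}

// Linearize a multi-index, skipping size-1 dims; this is what collapses the
// reduced (or broadcast) dimensions onto the smaller tensor.
int GetIndexFromDims(const int n, const int* dims, const int* index) {
  int sum = 0;
  for (int i = 0; i < n; ++i) {
    if (dims[i] > 1) {
      sum = sum * dims[i] + index[i];
    }
  }
  return sum;
}

int main() {
  const int X_dims[2] = {2, 3}; // input is 2 x 3
  const int Y_dims[2] = {1, 3}; // reduce over axis 0 -> output is 1 x 3
  int index[2] = {0, 0};
  for (int x = 0; x < 2 * 3; ++x) {
    const int y = GetIndexFromDims(2, Y_dims, index);
    assert(y == x % 3); // every row of X maps onto the same length-3 output
    IncreaseIndexInDims(2, X_dims, index);
  }
  return 0;
}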
+ +#define DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION(T, Func, expr) \ + template <> \ + void Rowwise##Func( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + if (C == B) { \ + EigenArrayMap(C, cols, rows).colwise() expr## = \ + ConstEigenVectorArrayMap(A, cols); \ + } else { \ + EigenArrayMap(C, cols, rows) = \ + ConstEigenArrayMap(B, cols, rows) \ + .colwise() expr ConstEigenVectorArrayMap(A, cols); \ + } \ + } \ + template <> \ + void Colwise##Func( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + if (C == B) { \ + EigenArrayMap(C, cols, rows).rowwise() expr## = \ + ConstEigenVectorArrayMap(A, rows).transpose(); \ + } else { \ + EigenArrayMap(C, cols, rows) = \ + ConstEigenArrayMap(B, cols, rows) \ + .rowwise() expr ConstEigenVectorArrayMap(A, rows) \ + .transpose(); \ + } \ + } + +#define DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Func, expr) \ + template <> \ + void Rowwise##Func( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + if (C == A) { \ + EigenArrayMap(C, cols, rows).colwise() expr## = \ + ConstEigenVectorArrayMap(B, cols); \ + } else { \ + EigenArrayMap(C, cols, rows) = \ + ConstEigenArrayMap(A, cols, rows) \ + .colwise() expr ConstEigenVectorArrayMap(B, cols); \ + } \ + } \ + template <> \ + void Colwise##Func( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + if (C == A) { \ + EigenArrayMap(C, cols, rows).rowwise() expr## = \ + ConstEigenVectorArrayMap(B, rows).transpose(); \ + } else { \ + EigenArrayMap(C, cols, rows) = \ + ConstEigenArrayMap(A, cols, rows) \ + .rowwise() expr ConstEigenVectorArrayMap(B, rows) \ + .transpose(); \ + } \ } -#define DEFINE_BROADCAST_BINARY_FUNCTION(name, op) \ - DELEGATE_BROADCAST_BINARY_FUNCTION(int32_t, name, op) \ - DELEGATE_BROADCAST_BINARY_FUNCTION(int64_t, name, op) \ - DELEGATE_BROADCAST_BINARY_FUNCTION(float, name, op) \ +#define DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(T, Func, expr) \ + DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION(T, Func, expr) \ + DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Func, expr) -DEFINE_BROADCAST_BINARY_FUNCTION(Add, +) -DEFINE_BROADCAST_BINARY_FUNCTION(Sub, -) -DEFINE_BROADCAST_BINARY_FUNCTION(Mul, *) -DEFINE_BROADCAST_BINARY_FUNCTION(Div, /) +#define DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Func, expr) \ + DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(float, Func, expr) \ + DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(double, Func, expr) \ + DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, Func, expr) \ + DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, Func, expr) -#undef DEFINE_BROADCAST_BINARY_FUNCTION -#undef DELEGATE_BROADCAST_BINARY_FUNCTION +DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Add, +) +DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Mul, *) + +#undef DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION +#undef DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION + +#define DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(T) \ + template <> \ + void RowwiseSub( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + EigenArrayMap(C, cols, rows) = \ + (-ConstEigenArrayMap(B, cols, rows)).colwise() + \ + ConstEigenVectorArrayMap(A, cols); \ + } \ + template <> \ + void ColwiseSub( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + EigenArrayMap(C, cols, rows) = \ + 
(-ConstEigenArrayMap(B, cols, rows)).rowwise() + \ + ConstEigenVectorArrayMap(A, rows).transpose(); \ + } \ + DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Sub, -) + +DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(float) +DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(double) +DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(std::int32_t) +DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(std::int64_t) + +#undef DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION + +#define DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(T) \ + template <> \ + void RowwiseDiv( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + EigenArrayMap(C, cols, rows) = \ + ConstEigenArrayMap(B, cols, rows).inverse().colwise() * \ + ConstEigenVectorArrayMap(A, cols); \ + } \ + template <> \ + void ColwiseDiv( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + EigenArrayMap(C, cols, rows) = \ + ConstEigenArrayMap(B, cols, rows).inverse().rowwise() * \ + ConstEigenVectorArrayMap(A, rows).transpose(); \ + } \ + DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Div, /) + +DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(float) +DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(double) +DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(std::int32_t, Div, /) +DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(std::int64_t, Div, /) + +#undef DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION + +#undef DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION +#undef DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION #define CAFFE2_SPECIALIZED_SET(T) \ template <> \ @@ -1053,48 +1205,13 @@ CAFFE2_SPECIALIZED_SET(uint8_t); CAFFE2_SPECIALIZED_SET(uint16_t); #undef CAFFE2_SPECIALIZED_SET -#define CAFFE2_INSTANTIATE_BINARY_OP(name, op, T) \ - template <> \ - void name( \ - const int n, const T* a, const T* b, bool* y, CPUContext*) { \ - for (int i = 0; i < n; ++i) { \ - y[i] = a[i] op b[i]; \ - } \ - } \ - template <> \ - void name##ToRow( \ - const int m, \ - const int n, \ - const T* a, \ - const T* b, \ - bool* y, \ - CPUContext*) { \ - for (int i = 0; i < n * m; ++i) { \ - y[i] = a[i] op b[i % n]; \ - } \ - } - -#define CAFFE2_DEFINE_BINARY_OP(name, op) \ - CAFFE2_INSTANTIATE_BINARY_OP(name, op, float) \ - CAFFE2_INSTANTIATE_BINARY_OP(name, op, int32_t) \ - CAFFE2_INSTANTIATE_BINARY_OP(name, op, int64_t) - -CAFFE2_DEFINE_BINARY_OP(LT, <); -CAFFE2_DEFINE_BINARY_OP(LE, <=); -CAFFE2_DEFINE_BINARY_OP(GT, >); -CAFFE2_DEFINE_BINARY_OP(GE, >=); - -CAFFE2_INSTANTIATE_BINARY_OP(Or, |, bool); -CAFFE2_INSTANTIATE_BINARY_OP(And, &, bool); -CAFFE2_INSTANTIATE_BINARY_OP(Xor, ^, bool); - template <> void Not( - const int n, + const int N, const bool* x, bool* y, CPUContext* /*context*/) { - for (int i = 0; i < n; ++i) { + for (int i = 0; i < N; ++i) { y[i] = !x[i]; } } @@ -1119,6 +1236,363 @@ void Not( CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(float); #undef CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH +namespace { + +template +void RowwiseBinaryOp( + const int rows, + const int cols, + const BinaryOperator& op, + const TIn* A, + const TIn* B, + TOut* C) { + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < cols; ++j) { + const int C_index = i * cols + j; + const int A_index = kBroadcast1st ? j : C_index; + const int B_index = kBroadcast1st ? 
C_index : j; + C[C_index] = op(A[A_index], B[B_index]); + } + } +} + +template +void ColwiseBinaryOp( + const int rows, + const int cols, + const BinaryOperator& op, + const TIn* A, + const TIn* B, + TOut* C) { + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < cols; ++j) { + const int C_index = i * cols + j; + const int A_index = kBroadcast1st ? i : C_index; + const int B_index = kBroadcast1st ? C_index : i; + C[C_index] = op(A[A_index], B[B_index]); + } + } +} + +template +void BinaryOpWith2DBroadcasting( + const int ndim, + const int* dims, + const int pivot, + const bool broadcast_1st, + const Operator1& op1, + const Operator2& op2, + const TIn* A, + const TIn* B, + TOut* C, + CPUContext* context) { + const int rows = + std::accumulate(dims, dims + pivot, 1, std::multiplies()); + const int cols = + std::accumulate(dims + pivot, dims + ndim, 1, std::multiplies()); + if (broadcast_1st) { + op1(rows, cols, A, B, C, context); + } else { + op2(rows, cols, A, B, C, context); + } +} + +template +void BroadcastBinaryOpImpl( + const int ndim, + const int* A_dims, + const int* B_dims, + const int* C_dims, + const BinaryOperator& op, + const TIn* A, + const TIn* B, + TOut* C) { + std::vector index(ndim, 0); + const int C_size = + std::accumulate(C_dims, C_dims + ndim, 1, std::multiplies()); + for (int C_index = 0; C_index < C_size; ++C_index) { + const int A_index = utils::GetIndexFromDims(ndim, A_dims, index.data()); + const int B_index = utils::GetIndexFromDims(ndim, B_dims, index.data()); + C[C_index] = op(A[A_index], B[B_index]); + utils::IncreaseIndexInDims(ndim, C_dims, index.data()); + } +} + +} // namespace + +#define DELEGATE_1D_BINARY_FUNCTION(TIn, TOut, Func, Op) \ + template <> \ + void Func( \ + const int N, const TIn* A, const TIn* B, TOut* C, CPUContext*) { \ + std::transform(A, A + N, B, C, Op()); \ + } + +#define DEFINE_1D_COMPARE_FUNCTION(Func, Op) \ + DELEGATE_1D_BINARY_FUNCTION(float, bool, Func, Op) \ + DELEGATE_1D_BINARY_FUNCTION(double, bool, Func, Op) \ + DELEGATE_1D_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ + DELEGATE_1D_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ + DELEGATE_1D_BINARY_FUNCTION(bool, bool, Func, Op) + +DEFINE_1D_COMPARE_FUNCTION(EQ, std::equal_to) +DEFINE_1D_COMPARE_FUNCTION(NE, std::not_equal_to) +DEFINE_1D_COMPARE_FUNCTION(LT, std::less) +DEFINE_1D_COMPARE_FUNCTION(LE, std::less_equal) +DEFINE_1D_COMPARE_FUNCTION(GT, std::greater) +DEFINE_1D_COMPARE_FUNCTION(GE, std::greater_equal) + +#undef DEFINE_1D_COMPARE_FUNCTION + +DELEGATE_1D_BINARY_FUNCTION(bool, bool, And, std::logical_and) +DELEGATE_1D_BINARY_FUNCTION(bool, bool, Or, std::logical_or) +DELEGATE_1D_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor) + +#define DEFINE_1D_BITWISE_BINARY_FUNCTION(Func, op) \ + DELEGATE_1D_BINARY_FUNCTION(bool, bool, Func, op) \ + DELEGATE_1D_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, op) \ + DELEGATE_1D_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, op) + +DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and) +DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or) +DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) + +#undef DEFINE_1D_BITWISE_BINARY_FUNCTION + +#undef DELEGATE_1D_BINARY_FUNCTION + +#define DELEGATE_2D_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \ + template <> \ + void Rowwise##Func( \ + const int rows, \ + const int cols, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CPUContext*) { \ + RowwiseBinaryOp, true>(rows, cols, Op(), A, B, C); \ + } \ + template <> \ + void Rowwise##Func( \ + const int rows, 
\ + const int cols, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CPUContext*) { \ + RowwiseBinaryOp, false>( \ + rows, cols, Op(), A, B, C); \ + } \ + template <> \ + void Colwise##Func( \ + const int rows, \ + const int cols, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CPUContext*) { \ + ColwiseBinaryOp, true>(rows, cols, Op(), A, B, C); \ + } \ + template <> \ + void Colwise##Func( \ + const int rows, \ + const int cols, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CPUContext*) { \ + ColwiseBinaryOp, false>( \ + rows, cols, Op(), A, B, C); \ + } + +#define DEFINE_2D_COMPARE_FUNCTION(Func, Op) \ + DELEGATE_2D_BROADCAST_BINARY_FUNCTION(float, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_BINARY_FUNCTION(double, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) + +DEFINE_2D_COMPARE_FUNCTION(EQ, std::equal_to) +DEFINE_2D_COMPARE_FUNCTION(NE, std::not_equal_to) +DEFINE_2D_COMPARE_FUNCTION(LT, std::less) +DEFINE_2D_COMPARE_FUNCTION(LE, std::less_equal) +DEFINE_2D_COMPARE_FUNCTION(GT, std::greater) +DEFINE_2D_COMPARE_FUNCTION(GE, std::greater_equal) + +#undef DEFINE_2D_COMPARE_FUNCTION + +DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, And, std::logical_and) +DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Or, std::logical_or) +DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor) + +#define DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(Func, Op) \ + DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) + +DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and) +DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or) +DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) + +#undef DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION + +#undef DELEGATE_2D_BROADCAST_BINARY_FUNCTION + +#define DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(T) \ + template <> \ + void RowwiseDiv( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + RowwiseBinaryOp, true>( \ + rows, cols, std::divides(), A, B, C); \ + } \ + template <> \ + void ColwiseDiv( \ + const int rows, \ + const int cols, \ + const T* A, \ + const T* B, \ + T* C, \ + CPUContext*) { \ + ColwiseBinaryOp, true>( \ + rows, cols, std::divides(), A, B, C); \ + } +DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int32_t) +DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int64_t) +#undef DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION + +#define DELEGATE_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \ + template <> \ + void Func( \ + const int A_ndim, \ + const int* A_dims, \ + const int B_ndim, \ + const int* B_dims, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CPUContext* context) { \ + const int ndim = std::max(A_ndim, B_ndim); \ + std::vector A_dims_array(ndim); \ + std::vector B_dims_array(ndim); \ + std::vector C_dims_array(ndim); \ + utils::ComputeBroadcastBinaryOpDims( \ + A_ndim, \ + A_dims, \ + B_ndim, \ + B_dims, \ + A_dims_array.data(), \ + B_dims_array.data(), \ + C_dims_array.data()); \ + if (A_dims_array == B_dims_array) { \ + const int size = std::accumulate( \ + C_dims_array.cbegin(), \ + C_dims_array.cend(), \ + 1, \ + std::multiplies()); \ + Func(size, A, B, C, context); \ + return; \ + } \ + int pivot; 
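Editor's note (illustrative, not part of the patch): to pin down the layout convention of the RowwiseBinaryOp/ColwiseBinaryOp helpers above for row-major data, "Rowwise" broadcasts a length-cols vector across every row and "Colwise" broadcasts a length-rows vector down every column; shown here for the second operand being broadcast (kBroadcast1st = false). A minimal numeric check, written as plain loops:

#include <cassert>

int main() {
  const int rows = 2;
  const int cols = 3;
  const int A[rows * cols] = {1, 2, 3, 4, 5, 6}; // row-major 2 x 3
  const int row_vec[cols] = {10, 20, 30};
  const int col_vec[rows] = {100, 200};
  int C[rows * cols];

  // RowwiseAdd: C(i, j) = A(i, j) + row_vec[j]
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      C[i * cols + j] = A[i * cols + j] + row_vec[j];
    }
  }
  assert(C[0] == 11 && C[5] == 36);

  // ColwiseAdd: C(i, j) = A(i, j) + col_vec[i]
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      C[i * cols + j] = A[i * cols + j] + col_vec[i];
    }
  }
  assert(C[0] == 101 && C[5] == 206);
  return 0;
}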
\ + bool broadcast_1st; \ + if (utils::IsRowwiseBroadcastBinaryOp( \ + ndim, \ + A_dims_array.data(), \ + B_dims_array.data(), \ + &pivot, \ + &broadcast_1st)) { \ + BinaryOpWith2DBroadcasting( \ + ndim, \ + C_dims_array.data(), \ + pivot, \ + broadcast_1st, \ + Rowwise##Func, \ + Rowwise##Func, \ + A, \ + B, \ + C, \ + context); \ + return; \ + } \ + if (utils::IsColwiseBroadcastBinaryOp( \ + ndim, \ + A_dims_array.data(), \ + B_dims_array.data(), \ + &pivot, \ + &broadcast_1st)) { \ + BinaryOpWith2DBroadcasting( \ + ndim, \ + C_dims_array.data(), \ + pivot, \ + broadcast_1st, \ + Colwise##Func, \ + Colwise##Func, \ + A, \ + B, \ + C, \ + context); \ + return; \ + } \ + BroadcastBinaryOpImpl( \ + ndim, \ + A_dims_array.data(), \ + B_dims_array.data(), \ + C_dims_array.data(), \ + Op(), \ + A, \ + B, \ + C); \ + } + +#define DEFINE_BROADCAST_COMPARE_FUNCTION(Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(float, bool, Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(double, bool, Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) + +DEFINE_BROADCAST_COMPARE_FUNCTION(EQ, std::equal_to) +DEFINE_BROADCAST_COMPARE_FUNCTION(NE, std::not_equal_to) +DEFINE_BROADCAST_COMPARE_FUNCTION(LT, std::less) +DEFINE_BROADCAST_COMPARE_FUNCTION(LE, std::less_equal) +DEFINE_BROADCAST_COMPARE_FUNCTION(GT, std::greater) +DEFINE_BROADCAST_COMPARE_FUNCTION(GE, std::greater_equal) + +#undef DEFINE_BROADCAST_COMPARE_FUNCTION + +#define DEFINE_BROADCAST_BINARY_FUNCTION(Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(float, float, Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(double, double, Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) + +DEFINE_BROADCAST_BINARY_FUNCTION(Add, std::plus) +DEFINE_BROADCAST_BINARY_FUNCTION(Sub, std::minus) +DEFINE_BROADCAST_BINARY_FUNCTION(Mul, std::multiplies) +DEFINE_BROADCAST_BINARY_FUNCTION(Div, std::divides) + +#undef DEFINE_BROADCAST_BINARY_FUNCTION + +DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, And, std::logical_and) +DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Or, std::logical_or) +DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor) + +#define DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) + +DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and) +DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or) +DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) + +#undef DEFINE_BITWISE_BROADCAST_BINARY_FUNCTION + +#undef DELEGATE_BROADCAST_BINARY_FUNCTION + template <> void RandUniform( const size_t n, @@ -1127,7 +1601,7 @@ void RandUniform( float* r, CPUContext* context) { std::uniform_real_distribution distribution(a, b); - for (auto i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) { r[i] = distribution(context->RandGenerator()); } } @@ -1140,7 +1614,7 @@ void RandUniform( int* r, CPUContext* context) { std::uniform_int_distribution distribution(a, b); - for (auto i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) { r[i] = distribution(context->RandGenerator()); } } @@ -1185,7 +1659,7 @@ void RandGaussian( float* r, CPUContext* 
context) { std::normal_distribution distribution(mean, std); - for (auto i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) { r[i] = distribution(context->RandGenerator()); } } @@ -1280,7 +1754,7 @@ void Im2ColNdNCHWImpl( } else if (!is_padding) { Y_data[img_index] += X_data[col_index]; } - internal::IncreaseIndexInDims(N, col_shape + 1, d_iter.data()); + utils::IncreaseIndexInDims(N, col_shape + 1, d_iter.data()); } } } @@ -1693,10 +2167,9 @@ void BiasCHW( // FIXME: if input < kVecSizeInFloat, can't vectorize at all - int prologue = - kVecSizeInFloat - - // remainder in floats - (((uintptr_t) image) % (sizeof(float32x4_t))) / sizeof(float); + int prologue = kVecSizeInFloat - + // remainder in floats + (((uintptr_t)image) % (sizeof(float32x4_t))) / sizeof(float); int i = 0; // Prologue loop @@ -1900,7 +2373,7 @@ void TransposeCPUImpl( X + block_size * X_index, block_size * sizeof(T)); } - internal::IncreaseIndexInDims(itr_axes, Y_dims.data(), index.data()); + utils::IncreaseIndexInDims(itr_axes, Y_dims.data(), index.data()); } } @@ -1922,16 +2395,16 @@ void Transpose( TransposeCPUImpl(ndim, dims, axes, X, Y); } -#define CAFFE2_SPECIALIZED_TRANSPOSE(T) \ - template <> \ - void Transpose( \ - const int ndim, \ - const int* dims, \ - const int* axes, \ - const T* X, \ - T* Y, \ - CPUContext* /* context */) { \ - TransposeCPUImpl(ndim, dims, axes, X, Y); \ +#define CAFFE2_SPECIALIZED_TRANSPOSE(T) \ + template <> \ + void Transpose( \ + const int ndim, \ + const int* dims, \ + const int* axes, \ + const T* X, \ + T* Y, \ + CPUContext* /* context */) { \ + TransposeCPUImpl(ndim, dims, axes, X, Y); \ } CAFFE2_SPECIALIZED_TRANSPOSE(double) CAFFE2_SPECIALIZED_TRANSPOSE(int) diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu index ca05238a62fe91..b667351c78d564 100644 --- a/caffe2/utils/math_gpu.cu +++ b/caffe2/utils/math_gpu.cu @@ -9,6 +9,8 @@ #include #include +#include + #include "caffe2/core/context_gpu.h" #include "caffe2/utils/conversions.h" @@ -19,117 +21,604 @@ namespace caffe2 { namespace math { -#define DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(T, Funcname, function) \ - __global__ void _Kernel_##T##_##Funcname(const int N, const T* x, T* y) { \ - CUDA_1D_KERNEL_LOOP(i, N) { \ - y[i] = function(x[i]); \ - } \ - } \ - template <> \ - void Funcname( \ - const int N, const T* x, T* y, CUDAContext* context) { \ - _Kernel_##T##_##Funcname<<< \ - CAFFE_GET_BLOCKS(N), \ - CAFFE_CUDA_NUM_THREADS, \ - 0, \ - context->cuda_stream()>>>(N, x, y); \ - } - -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Exp, expf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Log, logf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cos, cosf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Acos, acosf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sin, sinf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Asin, asinf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tan, tanf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Atan, atanf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Abs, fabsf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqrt, sqrtf); -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, InvSqrt, rsqrtf); - -__device__ float cuda_sqrf(const float x) { +namespace { + +inline __host__ __device__ bool Not(const bool x) { + return !x; +} + +template +inline __host__ __device__ T Negate(const T& x) { + return -x; +} + +template +inline __host__ __device__ T Square(const T& x) { return x * x; } -DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqr, cuda_sqrf); +template +inline __host__ __device__ T Sign(const T& x) { + return x > 0 ? 
T(1) : (x < 0 ? T(-1) : T(0)); +} + +#define DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Func, expr) \ + template \ + struct Func##Functor { \ + inline __host__ __device__ T \ + operator()(const T& lhs, const T& rhs) const { \ + return lhs expr rhs; \ + } \ + }; \ + template <> \ + struct Func##Functor { \ + inline __host__ __device__ float16 \ + operator()(const float16& lhs, const float16& rhs) const { \ + return convert::To(convert::To( \ + lhs) expr convert::To(rhs)); \ + } \ + }; +DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Add, +) +DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Sub, -) +DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Mul, *) +DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Div, /) +#undef DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR + +template +__global__ void SinCosCUDAKernel(const int N, const T* X, T* S, T* C) { + CUDA_1D_KERNEL_LOOP(i, N) { +#if __CUDA_ARCH__ >= 350 + sincos(__ldg(X + i), S + i, C + i); +#else + sincos(X[i], S + i, C + i); +#endif + } +} + +template +__global__ void SimpleBinaryOpCUDAKernel( + const int N, + const BinaryOperator op, + const TIn* A, + const TIn* B, + TOut* C) { + CUDA_1D_KERNEL_LOOP(i, N) { + C[i] = op(A[i], B[i]); + } +} + +template +__global__ void RowwiseBinaryOpCUDAKenel( + const int rows, + const int cols, + const BinaryOperator op, + const TIn* A, + const TIn* B, + TOut* C) { + const int size = rows * cols; + CUDA_1D_KERNEL_LOOP(C_index, size) { + const int j = C_index % cols; + const int A_index = broadcast_1st ? j : C_index; + const int B_index = broadcast_1st ? C_index : j; + C[C_index] = op(A[A_index], B[B_index]); + } +} + +template +__global__ void ColwiseBinaryOpCUDAKenel( + const int rows, + const int cols, + const BinaryOperator op, + const TIn* A, + const TIn* B, + TOut* C) { + const int size = rows * cols; + CUDA_1D_KERNEL_LOOP(C_index, size) { + const int i = C_index / cols; + const int A_index = broadcast_1st ? i : C_index; + const int B_index = broadcast_1st ? C_index : i; + C[C_index] = op(A[A_index], B[B_index]); + } +} + +template +__global__ void BroadcastBinaryOpCUDAKernel( + const int size, + const SimpleArray A_strides, + const SimpleArray B_strides, + const SimpleArray C_dims, + const BinaryOperator op, + const TIn* A, + const TIn* B, + TOut* C) { + CUDA_1D_KERNEL_LOOP(C_index, size) { + int A_index = 0; + int B_index = 0; + int C_index_val = C_index; +#pragma unroll + for (int i = D - 1; i >= 0; --i) { + const int d = C_index_val % C_dims.data[i]; + A_index += A_strides.data[i] == 0 ? 0 : d * A_strides.data[i]; + B_index += B_strides.data[i] == 0 ? 
0 : d * B_strides.data[i]; + C_index_val /= C_dims.data[i]; + } + C[C_index] = op(A[A_index], B[B_index]); + } +} + +template +void BinaryOpWith2DBroadcasting( + const int ndim, + const int* dims, + const int pivot, + const bool rowwise_broadcast, + const bool broadcast_1st, + const BinaryOperator& op, + const TIn* A, + const TIn* B, + TOut* C, + CUDAContext* context) { + const int rows = + std::accumulate(dims, dims + pivot, 1, std::multiplies()); + const int cols = + std::accumulate(dims + pivot, dims + ndim, 1, std::multiplies()); + const int size = rows * cols; + if (rowwise_broadcast) { + if (broadcast_1st) { + RowwiseBinaryOpCUDAKenel + <<cuda_stream()>>>(rows, cols, op, A, B, C); + } else { + RowwiseBinaryOpCUDAKenel + <<cuda_stream()>>>(rows, cols, op, A, B, C); + } + } else { + if (broadcast_1st) { + ColwiseBinaryOpCUDAKenel + <<cuda_stream()>>>(rows, cols, op, A, B, C); + } else { + ColwiseBinaryOpCUDAKenel + <<cuda_stream()>>>(rows, cols, op, A, B, C); + } + } +} + +template +void BroadcastBinaryOpImpl( + const int* A_dims, + const int* B_dims, + const int* C_dims, + const BinaryOperator& op, + const TIn* A, + const TIn* B, + TOut* C, + CUDAContext* context) { + SimpleArray A_strides_array; + SimpleArray B_strides_array; + SimpleArray C_dims_array; + int A_stride = 1; + int B_stride = 1; + for (int i = D - 1; i >= 0; --i) { + A_strides_array.data[i] = A_dims[i] == 1 ? 0 : A_stride; + B_strides_array.data[i] = B_dims[i] == 1 ? 0 : B_stride; + A_stride *= A_dims[i]; + B_stride *= B_dims[i]; + } + std::copy(C_dims, C_dims + D, C_dims_array.data); + const int size = + std::accumulate(C_dims, C_dims + D, 1, std::multiplies()); + BroadcastBinaryOpCUDAKernel + <<cuda_stream()>>>( + size, A_strides_array, B_strides_array, C_dims_array, op, A, B, C); +} + +template +void BroadcastBinaryOp( + const int A_ndim, + const int* A_dims, + const int B_ndim, + const int* B_dims, + const BinaryOperator& op, + const TIn* A, + const TIn* B, + TOut* C, + CUDAContext* context) { + const int ndim = std::max(A_ndim, B_ndim); + std::vector A_dims_array(ndim); + std::vector B_dims_array(ndim); + std::vector C_dims_array(ndim); + utils::ComputeBroadcastBinaryOpDims( + A_ndim, + A_dims, + B_ndim, + B_dims, + A_dims_array.data(), + B_dims_array.data(), + C_dims_array.data()); + if (A_dims_array == B_dims_array) { + const int size = std::accumulate( + C_dims_array.cbegin(), C_dims_array.cend(), 1, std::multiplies()); + SimpleBinaryOpCUDAKernel + <<cuda_stream()>>>(size, op, A, B, C); + return; + } + int pivot; + bool broadcast_1st; + if (utils::IsRowwiseBroadcastBinaryOp( + ndim, + A_dims_array.data(), + B_dims_array.data(), + &pivot, + &broadcast_1st)) { + BinaryOpWith2DBroadcasting( + ndim, + C_dims_array.data(), + pivot, + true, + broadcast_1st, + op, + A, + B, + C, + context); + return; + } + if (utils::IsColwiseBroadcastBinaryOp( + ndim, + A_dims_array.data(), + B_dims_array.data(), + &pivot, + &broadcast_1st)) { + BinaryOpWith2DBroadcasting( + ndim, + C_dims_array.data(), + pivot, + false, + broadcast_1st, + op, + A, + B, + C, + context); + return; + } + DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_3( + ndim, + BroadcastBinaryOpImpl, + TIn, + TOut, + BinaryOperator, + A_dims_array.data(), + B_dims_array.data(), + C_dims_array.data(), + op, + A, + B, + C, + context); +} + +} // namespace + +#define DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(T, Func, op) \ + __global__ void Func##CUDAKernel(const int N, const T* X, T* Y) { \ + CUDA_1D_KERNEL_LOOP(i, N) { \ + Y[i] = op(X[i]); \ + } \ + } \ + template <> \ + void Func( \ 
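Editor's note (illustrative, not part of the patch): BroadcastBinaryOpCUDAKernel and BroadcastBinaryOpImpl above implement general broadcasting by giving size-1 dimensions a stride of 0 and decomposing the linear output index inside the kernel. A host-side sketch of that index arithmetic, assuming the shapes have already been right-aligned and padded with 1s, as BroadcastBinaryOp does:

#include <cassert>

int main() {
  const int D = 2;
  const int C_dims[D] = {2, 3}; // output is 2 x 3
  const int A_dims[D] = {2, 1}; // A is 2 x 1, broadcast along dim 1

  // Row-major strides, zeroed where A_dims[i] == 1 so the same element is
  // re-read for every broadcast position.
  int A_strides[D];
  int stride = 1;
  for (int i = D - 1; i >= 0; --i) {
    A_strides[i] = (A_dims[i] == 1) ? 0 : stride;
    stride *= A_dims[i];
  }

  for (int C_index = 0; C_index < 2 * 3; ++C_index) {
    int A_index = 0;
    int rem = C_index;
    for (int i = D - 1; i >= 0; --i) {
      const int d = rem % C_dims[i];
      A_index += d * A_strides[i];
      rem /= C_dims[i];
    }
    // Every element of output row i reads A[i][0].
    assert(A_index == C_index / 3);
  }
  return 0;
}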
+ const int N, const T* x, T* y, CUDAContext* context) { \ + Func##CUDAKernel<<< \ + CAFFE_GET_BLOCKS(N), \ + CAFFE_CUDA_NUM_THREADS, \ + 0, \ + context->cuda_stream()>>>(N, x, y); \ + } + +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Exp, expf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Log, logf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cos, cosf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Acos, acosf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sin, sinf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Asin, asinf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tan, tanf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Atan, atanf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Abs, fabsf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqrt, sqrtf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, InvSqrt, rsqrtf) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqr, Square) + +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(bool, Not, Not) + +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Neg, Negate) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Neg, Negate) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Neg, Negate) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Neg, Negate) + +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sign, Sign) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Sign, Sign) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Sign, Sign) +DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Sign, Sign) #undef DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION -#define DELEGATE_SINCOS_CUDA_FUNCTION(T) \ - __global__ void _Kernel_##T##_##SinCos( \ - const int N, const T* x, T* ys, T* yc) { \ - CUDA_1D_KERNEL_LOOP(i, N) { \ - sincos(x[i], ys + i, yc + i); \ - } \ - } \ +#define CAFFE2_SPECIALIZED_CUDA_SINCOS(T) \ template <> \ void SinCos( \ const int N, const T* x, T* ys, T* yc, CUDAContext* context) { \ - _Kernel_##T##_##SinCos<<< \ + SinCosCUDAKernel<<< \ CAFFE_GET_BLOCKS(N), \ CAFFE_CUDA_NUM_THREADS, \ 0, \ context->cuda_stream()>>>(N, x, ys, yc); \ } +CAFFE2_SPECIALIZED_CUDA_SINCOS(float) +CAFFE2_SPECIALIZED_CUDA_SINCOS(double) +#undef CAFFE2_SPECIALIZED_CUDA_SINCOS + +#define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ + template <> \ + void Func( \ + const int N, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CUDAContext* context) { \ + SimpleBinaryOpCUDAKernel> \ + <<cuda_stream()>>>(N, Op(), A, B, C); \ + } -DELEGATE_SINCOS_CUDA_FUNCTION(float) -DELEGATE_SINCOS_CUDA_FUNCTION(double) +#define DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, bool, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, bool, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) -#undef DELEGATE_SINCOS_CUDA_FUNCTION +DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(EQ, thrust::equal_to) +DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(NE, thrust::not_equal_to) +DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(LT, thrust::less) +DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(LE, thrust::less_equal) +DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(GT, thrust::greater) +DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(GE, thrust::greater_equal) -#define DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(T, Funcname, expr) \ - __global__ void _Kernel_##T##_##Funcname( \ - const int N, const T* a, const T* b, T* y) { \ - CUDA_1D_KERNEL_LOOP(i, N) { \ - float r = convert::To(a[i]) expr convert::To(b[i]); \ - y[i] = convert::To(r); \ - } \ - } \ - template <> \ - void Funcname( \ - const int N, 
const T* a, const T* b, T* y, CUDAContext* context) { \ - _Kernel_##T##_##Funcname<<< \ - CAFFE_GET_BLOCKS(N), \ - CAFFE_CUDA_NUM_THREADS, \ - 0, \ - context->cuda_stream()>>>(N, a, b, y); \ - } - -DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float, Add, +); -DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(int32_t, Add, +); -DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float, Sub, -); -DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float, Mul, *); -DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float, Div, /); - -DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float16, Add, +); -DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float16, Sub, -); -DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float16, Mul, *); -DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float16, Div, /); - -#undef DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION - -#define DELEGATE_SIMPLE_CUDA_BINARY_PREFIX_FUNCTION(T, Funcname, func) \ - __global__ void _Kernel_##T##_##Funcname( \ - const int N, const T* a, const T* b, T* y) { \ - CUDA_1D_KERNEL_LOOP(i, N) { \ - float r = \ - func(convert::To(a[i]), convert::To(b[i])); \ - y[i] = convert::To(r); \ - } \ - } \ - template <> \ - void Funcname( \ - const int N, const T* a, const T* b, T* y, CUDAContext* context) { \ - _Kernel_##T##_##Funcname<<< \ - CAFFE_GET_BLOCKS(N), \ - CAFFE_CUDA_NUM_THREADS, \ - 0, \ - context->cuda_stream()>>>(N, a, b, y); \ - } +#undef DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION + +#define DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, float, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, double, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float16, float16, Func, Op) + +DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Add, AddFunctor) +DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Sub, SubFunctor) +DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Mul, MulFunctor) +DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Div, DivFunctor) + +#undef DEFINE_SIMPLE_CUDA_BINARY_FUNCTION + +DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, And, thrust::logical_and) +DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or) +DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor) + +#define DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) + +DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and) +DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or) +DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) -DELEGATE_SIMPLE_CUDA_BINARY_PREFIX_FUNCTION(float, ElemwiseMax, fmaxf); +#undef DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION -#undef DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION +DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION( + float, + float, + ElemwiseMax, + thrust::maximum); + +#undef DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION + +#define DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ + template <> \ + void Rowwise##Func( \ + const int rows, \ + const int cols, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CUDAContext* context) { \ + const int size = rows * cols; \ + RowwiseBinaryOpCUDAKenel, true> \ + <<cuda_stream()>>>(rows, cols, Op(), A, B, C); \ + } \ + template <> \ + void Rowwise##Func( \ + const int rows, \ + const int cols, \ + 
const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CUDAContext* context) { \ + const int size = rows * cols; \ + RowwiseBinaryOpCUDAKenel, false> \ + <<cuda_stream()>>>(rows, cols, Op(), A, B, C); \ + } \ + template <> \ + void Colwise##Func( \ + const int rows, \ + const int cols, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CUDAContext* context) { \ + const int size = rows * cols; \ + ColwiseBinaryOpCUDAKenel, true> \ + <<cuda_stream()>>>(rows, cols, Op(), A, B, C); \ + } \ + template <> \ + void Colwise##Func( \ + const int rows, \ + const int cols, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + CUDAContext* context) { \ + const int size = rows * cols; \ + ColwiseBinaryOpCUDAKenel, false> \ + <<cuda_stream()>>>(rows, cols, Op(), A, B, C); \ + } + +#define DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(float, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(double, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) + +DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(EQ, thrust::equal_to) +DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(NE, thrust::not_equal_to) +DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(LT, thrust::less) +DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(LE, thrust::less_equal) +DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(GT, thrust::greater) +DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(GE, thrust::greater_equal) + +#undef DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION + +#define DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \ + std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \ + std::int64_t, std::int64_t, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(float, float, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(double, double, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(float16, float16, Func, Op) + +DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Add, AddFunctor) +DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Sub, SubFunctor) +DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Mul, MulFunctor) +DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Div, DivFunctor) + +#undef DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION + +DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, And, thrust::logical_and) +DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or) +DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor) + +#define DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \ + std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \ + std::int64_t, std::int64_t, Func, Op) + +DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and) +DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or) +DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) + +#undef DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION + +#undef DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION + +#define DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ + template <> \ + void Func( \ + const int A_ndim, \ + const int* A_dims, \ + const int B_ndim, \ + const int* B_dims, \ + const TIn* A, \ + const TIn* B, \ + TOut* C, \ + 
CUDAContext* context) { \ + BroadcastBinaryOp<TIn, TOut, Op<TIn>>( \ + A_ndim, A_dims, B_ndim, B_dims, Op<TIn>(), A, B, C, context); \ + } + +#define DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(float, bool, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(double, bool, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) + +DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(EQ, thrust::equal_to) +DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(NE, thrust::not_equal_to) +DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(LT, thrust::less) +DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(LE, thrust::less_equal) +DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(GT, thrust::greater) +DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(GE, thrust::greater_equal) + +#undef DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION + +#define DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION( \ + std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION( \ + std::int64_t, std::int64_t, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(float, float, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(double, double, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(float16, float16, Func, Op) + +DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Add, AddFunctor) +DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Sub, SubFunctor) +DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Mul, MulFunctor) +DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Div, DivFunctor) + +#undef DEFINE_BROADCAST_CUDA_BINARY_FUNCTION + +DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, And, thrust::logical_and) +DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or) +DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor) + +#define DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION( \ + std::int32_t, std::int32_t, Func, Op) \ + DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) + +DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and) +DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or) +DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) + +#undef DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION + +#undef DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION #define DELEGATE_REDUCTION_FUNCTION(T, Funcname, func) \ template <> \ @@ -2228,43 +2717,6 @@ void Maximum( namespace { -std::vector<int> MakeTransposeAxes( - const int num_dims, - const int* dims, - const int num_axes, - const int* axes) { - std::vector<int> transpose_axes(num_dims); - const int d = num_dims - num_axes; - std::copy_n(axes, num_axes, transpose_axes.begin() + d); - std::sort(transpose_axes.begin() + d, transpose_axes.end()); - int p = 0; - int q = d; - for (int i = 0; i < num_dims; ++i) { - if (q < num_dims && i == transpose_axes[q]) { - ++q; - } else { - transpose_axes[p++] = i; - } - } - return transpose_axes; -} - -template <int D> -void ComputeTransposedStrides( - const int* X_dims, - const int* axes, - int* X_strides) { - int buff[D]; - int cur_stride = 1; - for (int i = D - 1; i >= 0; --i) { - buff[i] = cur_stride; - cur_stride *= X_dims[i]; - } - for (int i = 0; i < D; ++i) { - X_strides[i] = buff[axes[i]]; - } -} - template <typename T, class Reducer, int D> __global__ void ReduceTensorCUDAKernel( const int outer_size, @@ 
-2282,9 +2734,9 @@ __global__ void ReduceTensorCUDAKernel( int X_index = 0; int Y_index = i * inner_size + j; #pragma unroll - for (int i = D - 1; i >= 0; --i) { - X_index += (Y_index % Y_dims.data[i]) * X_strides.data[i]; - Y_index /= Y_dims.data[i]; + for (int d = D - 1; d >= 0; --d) { + X_index += (Y_index % Y_dims.data[d]) * X_strides.data[d]; + Y_index /= Y_dims.data[d]; } #if __CUDA_ARCH__ >= 350 val = reducer(val, __ldg(X + X_index)); @@ -2313,7 +2765,7 @@ void ReduceTensorCUDAImpl( CUDAContext* context) { SimpleArray<int, D> X_strides; SimpleArray<int, D> Y_dims; - ComputeTransposedStrides<D>(dims, axes, X_strides.data); + utils::ComputeTransposedStrides(D, dims, axes, X_strides.data); for (int i = 0; i < D; ++i) { Y_dims.data[i] = dims[axes[i]]; } @@ -2337,8 +2789,9 @@ void ReduceTensorCUDA( T* Y, CUDAContext* context) { CAFFE_ENFORCE_LE(num_axes, num_dims); - const std::vector<int> transpose_axes = - MakeTransposeAxes(num_dims, dims, num_axes, axes); + std::vector<int> transpose_axes(num_dims); + utils::ComputeTransposeAxesForReduceOp( + num_dims, num_axes, axes, transpose_axes.data()); const int pivot = num_dims - num_axes; int outer_size = 1; for (int i = 0; i < pivot; ++i) { @@ -2648,7 +3101,7 @@ void MomentsCUDAImpl( CUDAContext* context) { SimpleArray<int, D> X_strides; SimpleArray<int, D> Y_dims; - ComputeTransposedStrides<D>(dims, axes, X_strides.data); + utils::ComputeTransposedStrides(D, dims, axes, X_strides.data); for (int i = 0; i < D; ++i) { Y_dims.data[i] = dims[axes[i]]; } @@ -2671,8 +3124,9 @@ void MomentsCUDA( T* variance, CUDAContext* context) { CAFFE_ENFORCE_LE(num_axes, num_dims); - const std::vector<int> transpose_axes = - MakeTransposeAxes(num_dims, dims, num_axes, axes); + std::vector<int> transpose_axes(num_dims); + utils::ComputeTransposeAxesForReduceOp( + num_dims, num_axes, axes, transpose_axes.data()); const int pivot = num_dims - num_axes; int outer_size = 1; for (int i = 0; i < pivot; ++i) { @@ -2757,7 +3211,7 @@ void TransposeCUDAImpl( CUDAContext* context) { SimpleArray<int, D> X_strides; SimpleArray<int, D> Y_dims; - ComputeTransposedStrides<D>(dims, axes, X_strides.data); + utils::ComputeTransposedStrides(D, dims, axes, X_strides.data); int size = 1; for (int i = 0; i < D; ++i) { Y_dims.data[i] = dims[axes[i]]; diff --git a/caffe2/utils/math_utils.cc b/caffe2/utils/math_utils.cc new file mode 100644 index 00000000000000..4f245c9b0fd0e3 --- /dev/null +++ b/caffe2/utils/math_utils.cc @@ -0,0 +1,141 @@ +#include "caffe2/utils/math_utils.h" + +#include <algorithm> +#include <vector> + +#include "caffe2/core/logging.h" + +namespace caffe2 { +namespace math { +namespace utils { + +void IncreaseIndexInDims(const int n, const int* dims, int* index) { + for (int i = n - 1; i >= 0; --i) { + ++index[i]; + if (index[i] >= dims[i]) { + index[i] -= dims[i]; + } else { + break; + } + } +} + +int GetIndexFromDims(const int n, const int* dims, const int* index) { + int sum = 0; + for (int i = 0; i < n; ++i) { + if (dims[i] > 1) { + sum = sum * dims[i] + index[i]; + } + } + return sum; +} + +void ComputeBroadcastBinaryOpDims( + const int A_ndim, + const int* A_dims, + const int B_ndim, + const int* B_dims, + int* A_broadcast_dims, + int* B_broadcast_dims, + int* C_broadcast_dims) { + const int ndim = std::max(A_ndim, B_ndim); + std::fill(A_broadcast_dims, A_broadcast_dims + ndim - A_ndim, 1); + std::fill(B_broadcast_dims, B_broadcast_dims + ndim - B_ndim, 1); + std::copy(A_dims, A_dims + A_ndim, A_broadcast_dims + ndim - A_ndim); + std::copy(B_dims, B_dims + B_ndim, B_broadcast_dims + ndim - B_ndim); + for (int i = 0; i < ndim; ++i) { + CAFFE_ENFORCE( + 
A_broadcast_dims[i] == B_broadcast_dims[i] || + A_broadcast_dims[i] <= 1 || B_broadcast_dims[i] <= 1); + if (A_broadcast_dims[i] == 0 || B_broadcast_dims[i] == 0) { + C_broadcast_dims[i] = 0; + } else { + C_broadcast_dims[i] = std::max(A_broadcast_dims[i], B_broadcast_dims[i]); + } + } +} + +bool IsRowwiseBroadcastBinaryOp( + const int ndim, + const int* A_dims, + const int* B_dims, + int* pivot, + bool* broadcast_1st) { + if (ndim == 0) { + return false; + } + int A_pivot = 0; + for (; A_pivot < ndim && A_dims[A_pivot] == 1; ++A_pivot) + ; + int B_pivot = 0; + for (; B_pivot < ndim && B_dims[B_pivot] == 1; ++B_pivot) + ; + if (A_pivot == B_pivot) { + return false; + } + *pivot = std::max(A_pivot, B_pivot); + *broadcast_1st = A_pivot > B_pivot; + return std::equal(A_dims + *pivot, A_dims + ndim, B_dims + *pivot); +} + +bool IsColwiseBroadcastBinaryOp( + const int ndim, + const int* A_dims, + const int* B_dims, + int* pivot, + bool* broadcast_1st) { + if (ndim == 0) { + return false; + } + int A_pivot = ndim - 1; + for (; A_pivot >= 0 && A_dims[A_pivot] == 1; --A_pivot) + ; + int B_pivot = ndim - 1; + for (; B_pivot >= 0 && B_dims[B_pivot] == 1; --B_pivot) + ; + if (A_pivot == B_pivot) { + return false; + } + *pivot = std::min(A_pivot, B_pivot) + 1; + *broadcast_1st = A_pivot < B_pivot; + return std::equal(A_dims, A_dims + *pivot, B_dims); +} + +void ComputeTransposeAxesForReduceOp( + const int num_dims, + const int num_reduce_axes, + const int* reduce_axes, + int* transpose_axes) { + const int d = num_dims - num_reduce_axes; + std::copy_n(reduce_axes, num_reduce_axes, transpose_axes + d); + std::sort(transpose_axes + d, transpose_axes + num_dims); + int p = 0; + int q = d; + for (int i = 0; i < num_dims; ++i) { + if (q < num_dims && i == transpose_axes[q]) { + ++q; + } else { + transpose_axes[p++] = i; + } + } +} + +void ComputeTransposedStrides( + const int ndim, + const int* dims, + const int* axes, + int* strides) { + std::vector<int> buff(ndim); + int cur_stride = 1; + for (int i = ndim - 1; i >= 0; --i) { + buff[i] = cur_stride; + cur_stride *= dims[i]; + } + for (int i = 0; i < ndim; ++i) { + strides[i] = buff[axes[i]]; + } +} + +} // namespace utils +} // namespace math +} // namespace caffe2 diff --git a/caffe2/utils/math_utils.h b/caffe2/utils/math_utils.h new file mode 100644 index 00000000000000..f5570391b422d7 --- /dev/null +++ b/caffe2/utils/math_utils.h @@ -0,0 +1,54 @@ +#ifndef CAFFE2_UTILS_MATH_UTILS_H_ +#define CAFFE2_UTILS_MATH_UTILS_H_ + +namespace caffe2 { +namespace math { +namespace utils { + +// Increase the index digits by one based on dims. +void IncreaseIndexInDims(const int n, const int* dims, int* index); + +// Get index value from dims and index digits. +int GetIndexFromDims(const int n, const int* dims, const int* index); + +// Computes the broadcast binary operation dims. 
+void ComputeBroadcastBinaryOpDims( + const int A_ndim, + const int* A_dims, + const int B_ndim, + const int* B_dims, + int* A_broadcast_dims, + int* B_broadcast_dims, + int* C_broadcast_dims); + +bool IsRowwiseBroadcastBinaryOp( + const int ndim, + const int* A_dims, + const int* B_dims, + int* pivot, + bool* broadcast_1st); + +bool IsColwiseBroadcastBinaryOp( + const int ndim, + const int* A_dims, + const int* B_dims, + int* pivot, + bool* broadcast_1st); + +void ComputeTransposeAxesForReduceOp( + const int num_dims, + const int num_reduce_axes, + const int* reduce_axes, + int* transpose_axes); + +void ComputeTransposedStrides( + const int ndim, + const int* dims, + const int* axes, + int* strides); + +} // namespace utils +} // namespace math +} // namespace caffe2 + +#endif // CAFFE2_UTILS_MATH_UTILS_H_ diff --git a/caffe2/utils/murmur_hash3.cc b/caffe2/utils/murmur_hash3.cc index 3173d3a80415d2..28b373165c98e5 100644 --- a/caffe2/utils/murmur_hash3.cc +++ b/caffe2/utils/murmur_hash3.cc @@ -18,6 +18,8 @@ #define FORCE_INLINE __forceinline +#define FALLTHROUGH_INTENDED + #include #define ROTL32(x, y) _rotl(x, y) @@ -31,6 +33,17 @@ #define FORCE_INLINE inline __attribute__((__always_inline__)) +#if defined(__clang__) && defined(__has_cpp_attribute) +#if __has_cpp_attribute(clang::fallthrough) +#define FALLTHROUGH_INTENDED [[clang::fallthrough]] +#endif +#elif defined(__GNUC__) && __GNUC__ > 6 +#define FALLTHROUGH_INTENDED [[gnu::fallthrough]] +#endif +#ifndef FALLTHROUGH_INTENDED +#define FALLTHROUGH_INTENDED +#endif + inline uint32_t rotl32(uint32_t x, int8_t r) { return (x << r) | (x >> (32 - r)); } @@ -121,8 +134,10 @@ void MurmurHash3_x86_32(const void* key, int len, uint32_t seed, void* out) { switch (len & 3) { case 3: k1 ^= tail[2] << 16; + FALLTHROUGH_INTENDED; case 2: k1 ^= tail[1] << 8; + FALLTHROUGH_INTENDED; case 1: k1 ^= tail[0]; k1 *= c1; @@ -222,47 +237,61 @@ void MurmurHash3_x86_128( switch (len & 15) { case 15: k4 ^= tail[14] << 16; + FALLTHROUGH_INTENDED; case 14: k4 ^= tail[13] << 8; + FALLTHROUGH_INTENDED; case 13: k4 ^= tail[12] << 0; k4 *= c4; k4 = ROTL32(k4, 18); k4 *= c1; h4 ^= k4; + FALLTHROUGH_INTENDED; case 12: k3 ^= tail[11] << 24; + FALLTHROUGH_INTENDED; case 11: k3 ^= tail[10] << 16; + FALLTHROUGH_INTENDED; case 10: k3 ^= tail[9] << 8; + FALLTHROUGH_INTENDED; case 9: k3 ^= tail[8] << 0; k3 *= c3; k3 = ROTL32(k3, 17); k3 *= c4; h3 ^= k3; + FALLTHROUGH_INTENDED; case 8: k2 ^= tail[7] << 24; + FALLTHROUGH_INTENDED; case 7: k2 ^= tail[6] << 16; + FALLTHROUGH_INTENDED; case 6: k2 ^= tail[5] << 8; + FALLTHROUGH_INTENDED; case 5: k2 ^= tail[4] << 0; k2 *= c2; k2 = ROTL32(k2, 16); k2 *= c3; h2 ^= k2; + FALLTHROUGH_INTENDED; case 4: k1 ^= tail[3] << 24; + FALLTHROUGH_INTENDED; case 3: k1 ^= tail[2] << 16; + FALLTHROUGH_INTENDED; case 2: k1 ^= tail[1] << 8; + FALLTHROUGH_INTENDED; case 1: k1 ^= tail[0] << 0; k1 *= c1; @@ -359,37 +388,51 @@ void MurmurHash3_x64_128( switch (len & 15) { case 15: k2 ^= ((uint64_t)tail[14]) << 48; + FALLTHROUGH_INTENDED; case 14: k2 ^= ((uint64_t)tail[13]) << 40; + FALLTHROUGH_INTENDED; case 13: k2 ^= ((uint64_t)tail[12]) << 32; + FALLTHROUGH_INTENDED; case 12: k2 ^= ((uint64_t)tail[11]) << 24; + FALLTHROUGH_INTENDED; case 11: k2 ^= ((uint64_t)tail[10]) << 16; + FALLTHROUGH_INTENDED; case 10: k2 ^= ((uint64_t)tail[9]) << 8; + FALLTHROUGH_INTENDED; case 9: k2 ^= ((uint64_t)tail[8]) << 0; k2 *= c2; k2 = ROTL64(k2, 33); k2 *= c1; h2 ^= k2; + FALLTHROUGH_INTENDED; case 8: k1 ^= ((uint64_t)tail[7]) << 56; + FALLTHROUGH_INTENDED; case 7: k1 ^= 
((uint64_t)tail[6]) << 48; + FALLTHROUGH_INTENDED; case 6: k1 ^= ((uint64_t)tail[5]) << 40; + FALLTHROUGH_INTENDED; case 5: k1 ^= ((uint64_t)tail[4]) << 32; + FALLTHROUGH_INTENDED; case 4: k1 ^= ((uint64_t)tail[3]) << 24; + FALLTHROUGH_INTENDED; case 3: k1 ^= ((uint64_t)tail[2]) << 16; + FALLTHROUGH_INTENDED; case 2: k1 ^= ((uint64_t)tail[1]) << 8; + FALLTHROUGH_INTENDED; case 1: k1 ^= ((uint64_t)tail[0]) << 0; k1 *= c1; diff --git a/caffe2/utils/threadpool/WorkersPool.h b/caffe2/utils/threadpool/WorkersPool.h index 4c3bfcf0a4fe46..9d2ffd57864b68 100644 --- a/caffe2/utils/threadpool/WorkersPool.h +++ b/caffe2/utils/threadpool/WorkersPool.h @@ -331,9 +331,9 @@ class WorkersPool { // One of the tasks will be run on the current thread. int workers_count = tasks.size() - 1; CreateWorkers(workers_count); - DCHECK_LE(workers_count, workers_.size()); + DCHECK_LE(workers_count, (int)workers_.size()); counter_to_decrement_when_ready_.Reset(workers_count); - for (auto task = 1; task < tasks.size(); ++task) { + for (size_t task = 1; task < tasks.size(); ++task) { workers_[task - 1]->StartWork(tasks[task].get()); } // Execute the remaining workload immediately on the current thread. diff --git a/cmake/Caffe2Config.cmake.in b/cmake/Caffe2Config.cmake.in index 3b9bb04afa213a..fe393669750304 100644 --- a/cmake/Caffe2Config.cmake.in +++ b/cmake/Caffe2Config.cmake.in @@ -69,22 +69,31 @@ if (@CAFFE2_KNOWN_PROTOBUF_VERSION@) endif() endif() -# Cuda if (@USE_CUDA@) + # The file public/cuda.cmake exclusively uses CAFFE2_USE_*. + # If Caffe2 was compiled with the libraries below, they must + # be found again when including the Caffe2 target. + set(CAFFE2_USE_CUDA @USE_CUDA@) + set(CAFFE2_USE_CUDNN @USE_CUDNN@) + set(CAFFE2_USE_TENSORRT @USE_TENSORRT@) include("${CMAKE_CURRENT_LIST_DIR}/public/cuda.cmake") - if (NOT CAFFE2_FOUND_CUDA) + if (@CAFFE2_USE_CUDA@ AND NOT CAFFE2_USE_CUDA) message(FATAL_ERROR - "Your installed Caffe2 version uses CUDA but I cannot find the CUDA " - "libraries. Please set the proper CUDA prefixes and / or install " - "CUDA.") + "Your installed Caffe2 version uses CUDA but I cannot find the CUDA " + "libraries. Please set the proper CUDA prefixes and / or install " + "CUDA.") endif() - if (@BUILD_CAFFE2@) - if (NOT CAFFE2_FOUND_CUDNN) - message(FATAL_ERROR - "Your installed Caffe2 version uses cuDNN but I cannot find the cuDNN " - "libraries. Please set the proper cuDNN prefixes and / or install " - "cuDNN.") - endif() + if (@CAFFE2_USE_CUDNN@ AND NOT CAFFE2_USE_CUDNN) + message(FATAL_ERROR + "Your installed Caffe2 version uses cuDNN but I cannot find the cuDNN " + "libraries. Please set the proper cuDNN prefixes and / or install " + "cuDNN.") + endif() + if (@CAFFE2_USE_TENSORRT@ AND NOT CAFFE2_USE_TENSORRT) + message(FATAL_ERROR + "Your installed Caffe2 version uses TensorRT but I cannot find the TensorRT " + "libraries. Please set the proper TensorRT prefixes and / or install " + "TensorRT.") endif() endif() diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 9a6611ebb99b72..30802b28590a5d 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -180,7 +180,7 @@ if(BUILD_TEST) set(gtest_force_shared_crt ON) endif() add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/googletest) - include_directories(${CMAKE_CURRENT_LIST_DIR}/../third_party/googletest/googletest/include) + include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../third_party/googletest/googletest/include) # We will not need to test benchmark lib itself. 
set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "Disable benchmark testing as we don't need it.") @@ -393,32 +393,53 @@ endif() # ---[ CUDA if(USE_CUDA) + # public/*.cmake uses CAFFE2_USE_* + set(CAFFE2_USE_CUDA ${USE_CUDA}) + set(CAFFE2_USE_CUDNN ${USE_CUDNN}) + set(CAFFE2_USE_TENSORRT ${USE_TENSORRT}) include(${CMAKE_CURRENT_LIST_DIR}/public/cuda.cmake) - if(CAFFE2_FOUND_CUDA) + if(CAFFE2_USE_CUDA) # A helper variable recording the list of Caffe2 dependent libraries # caffe2::cudart is dealt with separately, due to CUDA_ADD_LIBRARY # design reason (it adds CUDA_LIBRARIES itself). - set(Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS - caffe2::cuda caffe2::cufft caffe2::curand caffe2::nvrtc) - if(CAFFE2_FOUND_CUDNN) - LIST(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn) + set(Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cufft caffe2::curand) + if(BUILD_CAFFE2) + # Don't be deceived! caffe2::cuda is the low-level DRIVER API, + # not the actual CUDA library. Caffe2 depends directly on nvrtc + # and needs it, but ATen doesn't use it at all, and so the + # dependency is unnecessary. + # + # BTW, if you change this so that PyTorch has this dependency + # again, make sure Mac OS X GPU builds still work; there's + # a decent chance the library finding algorithm picked up + # on cuda.framework, which is totally not going to work when + # linking. + list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cuda caffe2::nvrtc) + endif() + if(CAFFE2_USE_CUDNN) + list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn) else() if(BUILD_CAFFE2) # TODO: Get rid of special case for Caffe2 where we require # CUDA *and* cuDNN to be installed. message(WARNING - "Not compiling with CUDA since cuDNN is missing. Suppress " - "this warning with -DUSE_CUDA=OFF.") + "Not compiling with CUDA since cuDNN is missing. Suppress " + "this warning with -DUSE_CUDA=OFF.") caffe2_update_option(USE_CUDA OFF) caffe2_update_option(USE_CUDNN OFF) + caffe2_update_option(USE_TENSORRT OFF) + set(CAFFE2_USE_CUDA OFF) + set(CAFFE2_USE_CUDNN OFF) + set(CAFFE2_USE_TENSORRT OFF) else() message(WARNING - "Not compiling with cuDNN. Suppress this warning with " - "-DUSE_CUDNN=OFF.") + "Not compiling with cuDNN. Suppress this warning with " + "-DUSE_CUDNN=OFF.") caffe2_update_option(USE_CUDNN OFF) + set(CAFFE2_USE_CUDNN OFF) endif() endif() - if(USE_CUDA) + if(CAFFE2_USE_CUDA) if(CAFFE2_STATIC_LINK_CUDA) # When statically linking, this must be the order of the libraries LIST(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS @@ -426,21 +447,28 @@ if(USE_CUDA) else() LIST(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cublas) endif() - if(USE_TENSORRT) + if(CAFFE2_USE_TENSORRT) list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::tensorrt) + else() + caffe2_update_option(USE_TENSORRT OFF) endif() endif() else() message(WARNING - "Not compiling with CUDA. Suppress this warning with " - "-DUSE_CUDA=OFF.") + "Not compiling with CUDA. 
Suppress this warning with " + "-DUSE_CUDA=OFF.") caffe2_update_option(USE_CUDA OFF) + caffe2_update_option(USE_CUDNN OFF) + caffe2_update_option(USE_TENSORRT OFF) + set(CAFFE2_USE_CUDA OFF) + set(CAFFE2_USE_CUDNN OFF) + set(CAFFE2_USE_TENSORRT OFF) endif() endif() # ---[ HIP -if(BUILD_CAFFE2) - include(cmake/public/LoadHIP.cmake) +if(BUILD_CAFFE2 OR BUILD_ATEN) + include(${CMAKE_CURRENT_LIST_DIR}/public/LoadHIP.cmake) if(PYTORCH_FOUND_HIP) message(INFO "Compiling with HIP for AMD.") caffe2_update_option(USE_ROCM ON) @@ -449,7 +477,7 @@ if(BUILD_CAFFE2) set(Caffe2_HIP_INCLUDES ${hip_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${Caffe2_HIP_INCLUDES} ${thrust_INCLUDE_DIRS}) set(Caffe2_HIP_DEPENDENCY_LIBS - ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES}) + ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipsparse_LIBRARIES} ${hipblas_LIBRARIES}) # TODO: There is a bug in rocblas's cmake files that exports the wrong targets name in ${rocblas_LIBRARIES} list(APPEND Caffe2_HIP_DEPENDENCY_LIBS @@ -464,7 +492,7 @@ if(USE_ROCM AND NOT BUILD_CAFFE2) include_directories(${HIP_PATH}/include) include_directories(${HIPBLAS_PATH}/include) include_directories(${HIPSPARSE_PATH}/include) - include_directories(${HIPRNG_PATH}/include) + include_directories(${HIPRAND_PATH}/include) include_directories(${THRUST_PATH}) # load HIP cmake module and load platform id @@ -472,7 +500,7 @@ if(USE_ROCM AND NOT BUILD_CAFFE2) EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS) # Link with HIPCC https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_porting_guide.md#linking-with-hipcc - SET(CMAKE_CXX_LINK_EXECUTABLE ${HIP_HIPCC_EXECUTABLE}) + # SET(CMAKE_CXX_LINK_EXECUTABLE ${HIP_HIPCC_EXECUTABLE}) # Show message that we're using ROCm. MESSAGE(STATUS "ROCM TRUE:") @@ -661,7 +689,6 @@ if (BUILD_ATEN) if (USE_CUDA) list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS aten_op_header_gen) endif() - include_directories(${PROJECT_BINARY_DIR}/caffe2/contrib/aten/aten/src/ATen) include_directories(${PROJECT_BINARY_DIR}/caffe2/contrib/aten) endif() endif() @@ -1126,6 +1153,13 @@ if (BUILD_ATEN) set(AT_CUDNN_ENABLED 1) ENDIF() + IF (NOT USE_ROCM) + MESSAGE(STATUS "MIOpen not found. 
Compiling without MIOpen support") + set(AT_MIOPEN_ENABLED 0) + ELSE() + set(AT_MIOPEN_ENABLED 1) + ENDIF() + if (NO_MKLDNN) message("disabling MKLDNN because NO_MKLDNN is set") set(AT_MKLDNN_ENABLED 0) diff --git a/cmake/Modules/FindMIOpen.cmake b/cmake/Modules/FindMIOpen.cmake new file mode 100644 index 00000000000000..6a047df7e52f3e --- /dev/null +++ b/cmake/Modules/FindMIOpen.cmake @@ -0,0 +1,63 @@ +# - Try to find MIOpen +# +# The following variables are optionally searched for defaults +# MIOPEN_ROOT_DIR: Base directory where all MIOpen components are found +# +# The following are set after configuration is done: +# MIOPEN_FOUND +# MIOPEN_INCLUDE_DIRS +# MIOPEN_LIBRARIES +# MIOPEN_LIBRARY_DIRS +# +# Borrowed from https://github.com/caffe2/caffe2/blob/master/cmake/Modules/FindCuDNN.cmake + +include(FindPackageHandleStandardArgs) + +set(MIOPEN_ROOT_DIR "" CACHE PATH "Folder contains MIOpen") + +if($ENV{MIOPEN_INCLUDE_DIR}) + SET(MIOPEN_INCLUDE_DIR $ENV{MIOPEN_INCLUDE_DIR}) +else($ENV{MIOPEN_INCLUDE_DIR}) + find_path(MIOPEN_INCLUDE_DIR miopen.h + HINTS ${MIOPEN_ROOT_DIR} + PATH_SUFFIXES include include/miopen) +endif($ENV{MIOPEN_INCLUDE_DIR}) + +if($ENV{MIOPEN_LIBRARY}) + SET(MIOPEN_LIBRARY $ENV{MIOPEN_LIBRARY}) +else($ENV{MIOPEN_LIBRARY}) + find_library(MIOPEN_LIBRARY MIOpen + HINTS ${MIOPEN_LIB_DIR} ${MIOPEN_ROOT_DIR} + PATH_SUFFIXES lib) +endif($ENV{MIOPEN_LIBRARY}) + +find_package_handle_standard_args( + MIOPEN DEFAULT_MSG MIOPEN_INCLUDE_DIR MIOPEN_LIBRARY) + +if(MIOPEN_FOUND) + # get MIOpen version + file(READ ${MIOPEN_INCLUDE_DIR}/version.h MIOPEN_HEADER_CONTENTS) + string(REGEX MATCH "define MIOPEN_VERSION_MAJOR * +([0-9]+)" + MIOPEN_VERSION_MAJOR "${MIOPEN_HEADER_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_MAJOR * +([0-9]+)" "\\1" + MIOPEN_VERSION_MAJOR "${MIOPEN_VERSION_MAJOR}") + string(REGEX MATCH "define MIOPEN_VERSION_MINOR * +([0-9]+)" + MIOPEN_VERSION_MINOR "${MIOPEN_HEADER_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_MINOR * +([0-9]+)" "\\1" + MIOPEN_VERSION_MINOR "${MIOPEN_VERSION_MINOR}") + string(REGEX MATCH "define MIOPEN_VERSION_PATCH * +([0-9]+)" + MIOPEN_VERSION_PATCH "${MIOPEN_HEADER_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_PATCH * +([0-9]+)" "\\1" + MIOPEN_VERSION_PATCH "${MIOPEN_VERSION_PATCH}") + # Assemble MIOpen version + if(NOT MIOPEN_VERSION_MAJOR) + set(MIOPEN_VERSION "?") + else() + set(MIOPEN_VERSION "${MIOPEN_VERSION_MAJOR}.${MIOPEN_VERSION_MINOR}.${MIOPEN_VERSION_PATCH}") + endif() + + set(MIOPEN_INCLUDE_DIRS ${MIOPEN_INCLUDE_DIR}) + set(MIOPEN_LIBRARIES ${MIOPEN_LIBRARY}) + message(STATUS "Found MIOpen: v${MIOPEN_VERSION} (include: ${MIOPEN_INCLUDE_DIR}, library: ${MIOPEN_LIBRARY})") + mark_as_advanced(MIOPEN_ROOT_DIR MIOPEN_LIBRARY MIOPEN_INCLUDE_DIR) +endif() diff --git a/cmake/ProtoBuf.cmake b/cmake/ProtoBuf.cmake index 11e6068f65fe82..8e287336e7d536 100644 --- a/cmake/ProtoBuf.cmake +++ b/cmake/ProtoBuf.cmake @@ -13,14 +13,14 @@ macro(custom_protobuf_find) if (${CAFFE2_LINK_LOCAL_PROTOBUF}) # If we are going to link protobuf locally, we will need to turn off # shared libs build for protobuf. - set(protobuf_BUILD_SHARED_LIBS OFF) + option(protobuf_BUILD_SHARED_LIBS "" OFF) else() # If we are building Caffe2 as shared libs, we will also build protobuf as # shared libs. - set(protobuf_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) + option(protobuf_BUILD_SHARED_LIBS "" ${BUILD_SHARED_LIBS}) endif() # We will make sure that protobuf and caffe2 uses the same msvc runtime. 
- set(protobuf_MSVC_STATIC_RUNTIME ${CAFFE2_USE_MSVC_STATIC_RUNTIME}) + option(protobuf_MSVC_STATIC_RUNTIME "" ${CAFFE2_USE_MSVC_STATIC_RUNTIME}) if (${CAFFE2_LINK_LOCAL_PROTOBUF}) set(__caffe2_CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ${CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS}) diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake index 7d813cc22de066..33cef8d0095fee 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake @@ -38,13 +38,6 @@ ELSE() SET(ROCBLAS_PATH $ENV{ROCBLAS_PATH}) ENDIF() -# HIPRNG_PATH -IF(NOT DEFINED ENV{HIPRNG_PATH}) - SET(HIPRNG_PATH ${ROCM_PATH}/hcrng) -ELSE() - SET(HIPRNG_PATH $ENV{HIPRNG_PATH}) -ENDIF() - # HIPSPARSE_PATH IF(NOT DEFINED ENV{HIPSPARSE_PATH}) SET(HIPSPARSE_PATH ${ROCM_PATH}/hcsparse) @@ -103,11 +96,15 @@ IF(HIP_FOUND) set(hiprand_DIR ${HIPRAND_PATH}/lib/cmake/hiprand) set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas) set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen) + set(hipblas_DIR ${HIPBLAS_PATH}/lib/cmake/hipblas) + set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse) find_package(rocrand REQUIRED) find_package(hiprand REQUIRED) find_package(rocblas REQUIRED) find_package(miopen REQUIRED) + #find_package(hipblas REQUIRED) There's a bug with the CMake file in the Hipblas package. + #find_package(hipsparse REQUIRED) # TODO: hip_hcc has an interface include flag "-hc" which is only # recognizable by hcc, but not gcc and clang. Right now in our @@ -117,6 +114,9 @@ IF(HIP_FOUND) # TODO: miopen_LIBRARIES should return fullpath to the library file, # however currently it's just the lib name FIND_LIBRARY(PYTORCH_MIOPEN_LIBRARIES ${miopen_LIBRARIES} HINTS ${MIOPEN_PATH}/lib) + FIND_LIBRARY(hiprand_LIBRARIES hiprand HINTS ${HIPRAND_PATH}/lib) + FIND_LIBRARY(hipblas_LIBRARIES hipblas HINTS ${HIPBLAS_PATH}/lib) + FIND_LIBRARY(hipsparse_LIBRARIES hipsparse HINTS ${HIPSPARSE_PATH}/lib) set(thrust_INCLUDE_DIRS ${THRUST_PATH} ${THRUST_PATH}/thrust/system/cuda/detail/cub-hip) diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake index cd790ed98d64e1..b4b2b89c4380d3 100644 --- a/cmake/public/cuda.cmake +++ b/cmake/public/cuda.cmake @@ -1,8 +1,5 @@ # ---[ cuda -set(CAFFE2_FOUND_CUDA FALSE) -set(CAFFE2_FOUND_CUDNN FALSE) - # Find CUDA. find_package(CUDA 7.0) if(NOT CUDA_FOUND) @@ -10,9 +7,9 @@ if(NOT CUDA_FOUND) "Caffe2: CUDA cannot be found. Depending on whether you are building " "Caffe2 or a Caffe2 dependent library, the next warning / error will " "give you more info.") + set(CAFFE2_USE_CUDA OFF) return() endif() -set(CAFFE2_FOUND_CUDA TRUE) message(STATUS "Caffe2: CUDA detected: " ${CUDA_VERSION}) message(STATUS "Caffe2: CUDA nvcc is: " ${CUDA_NVCC_EXECUTABLE}) message(STATUS "Caffe2: CUDA toolkit directory: " ${CUDA_TOOLKIT_ROOT_DIR}) @@ -22,10 +19,10 @@ if(CUDA_FOUND) # compiling with, e.g., if a ccache nvcc is fed to us by CUDA_NVCC_EXECUTABLE # but the PATH is not consistent with CUDA_HOME. It's better safe # than sorry: make sure everything is consistent. - set(file "${PROJECT_BINARY_DIR}/detect_cuda_version.c") + set(file "${PROJECT_BINARY_DIR}/detect_cuda_version.cc") file(WRITE ${file} "" "#include \n" - "#include \n" + "#include \n" "int main() {\n" " printf(\"%d.%d\", CUDA_VERSION / 1000, (CUDA_VERSION / 10) % 100);\n" " return 0;\n" @@ -94,13 +91,11 @@ find_package_handle_standard_args( if(NOT CUDNN_FOUND) message(WARNING "Caffe2: Cannot find cuDNN library. 
Turning the option off") - set(USE_CUDNN OFF) -else() - set(CAFFE2_FOUND_CUDNN TRUE) + set(CAFFE2_USE_CUDNN OFF) endif() # Optionally, find TensorRT -if (${USE_TENSORRT}) +if(CAFFE2_USE_TENSORRT) find_path(TENSORRT_INCLUDE_DIR NvInfer.h HINTS ${TENSORRT_ROOT} ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES include) @@ -112,12 +107,12 @@ if (${USE_TENSORRT}) if(NOT TENSORRT_FOUND) message(WARNING "Caffe2: Cannot find TensorRT library. Turning the option off") - set(USE_TENSORRT OFF) + set(CAFFE2_USE_TENSORRT OFF) endif() endif() -# ---[ Exract versions -if (CAFFE2_FOUND_CUDNN) +# ---[ Extract versions +if(CAFFE2_USE_CUDNN) # Get cuDNN version file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_HEADER_CONTENTS) string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)" @@ -189,7 +184,7 @@ set_property( # cudnn # static linking is handled by USE_STATIC_CUDNN environment variable -if(${USE_CUDNN}) +if(CAFFE2_USE_CUDNN) add_library(caffe2::cudnn UNKNOWN IMPORTED) set_property( TARGET caffe2::cudnn PROPERTY IMPORTED_LOCATION @@ -231,7 +226,7 @@ set_property( ${CUDA_INCLUDE_DIRS}) # TensorRT -if(${USE_TENSORRT}) +if(CAFFE2_USE_TENSORRT) add_library(caffe2::tensorrt UNKNOWN IMPORTED) set_property( TARGET caffe2::tensorrt PROPERTY IMPORTED_LOCATION @@ -270,191 +265,6 @@ set_property( # now, Caffe2 only uses the above libraries, so we will only wrap # these. -# ---[ Cuda flags - -# Known NVIDIA GPU achitectures Caffe2 can be compiled for. -# Default is set to cuda 9. If we detect the cuda architectures to be less than -# 9, we will lower it to the corresponding known archs. -set(Caffe2_known_gpu_archs "30 35 50 52 60 61 70") # for CUDA 9.x -set(Caffe2_known_gpu_archs8 "30 35 50 52 60 61") # for CUDA 8.x -set(Caffe2_known_gpu_archs7 "30 35 50 52") # for CUDA 7.x - -################################################################################################ -# A function for automatic detection of GPUs installed (if autodetection is enabled) -# Usage: -# caffe2_detect_installed_gpus(out_variable) -function(caffe2_detect_installed_gpus out_variable) - if(NOT CUDA_gpu_detect_output) - set(__cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) - - file(WRITE ${__cufile} "" - "#include \n" - "int main()\n" - "{\n" - " int count = 0;\n" - " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n" - " if (count == 0) return -1;\n" - " for (int device = 0; device < count; ++device)\n" - " {\n" - " cudaDeviceProp prop;\n" - " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" - " std::printf(\"%d.%d \", prop.major, prop.minor);\n" - " }\n" - " return 0;\n" - "}\n") - - execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}" ${CUDA_NVCC_FLAGS} "--run" "${__cufile}" - WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" - RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - - if(__nvcc_res EQUAL 0) - string(REPLACE "2.1" "2.1(2.0)" __nvcc_out "${__nvcc_out}") - set(CUDA_gpu_detect_output ${__nvcc_out} CACHE INTERNAL "Returned GPU architetures from caffe2_detect_installed_gpus tool" FORCE) - endif() - endif() - - if(NOT CUDA_gpu_detect_output) - message(STATUS "Automatic GPU detection failed. 
Building for all known architectures.") - set(${out_variable} ${Caffe2_known_gpu_archs} PARENT_SCOPE) - else() - message(STATUS "Automatic GPU detection returned ${CUDA_gpu_detect_output}.") - set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) - endif() -endfunction() - - -################################################################################################ -# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME -# Usage: -# caffe_select_nvcc_arch_flags(out_variable) -function(caffe2_select_nvcc_arch_flags out_variable) - # List of arch names - set(__archs_names "Kepler" "Maxwell" "Pascal" "Volta" "All" "Manual") - set(__archs_name_default "All") - if(NOT CMAKE_CROSSCOMPILING) - list(APPEND __archs_names "Auto") - set(__archs_name_default "Auto") - endif() - - # Set CUDA_ARCH_NAME strings (so it will be seen as dropbox in the CMake GUI) - set(CUDA_ARCH_NAME ${__archs_name_default} CACHE STRING "Select target NVIDIA GPU architecture.") - set_property(CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${__archs_names}) - mark_as_advanced(CUDA_ARCH_NAME) - - # Verify CUDA_ARCH_NAME value - if(NOT ";${__archs_names};" MATCHES ";${CUDA_ARCH_NAME};") - string(REPLACE ";" ", " __archs_names "${__archs_names}") - message(FATAL_ERROR "Invalid CUDA_ARCH_NAME, supported values: ${__archs_names}. Got ${CUDA_ARCH_NAME}.") - endif() - - if(${CUDA_ARCH_NAME} STREQUAL "Manual") - set(CUDA_ARCH_BIN "" CACHE STRING - "Specify GPU architectures to build binaries for (BIN(PTX) format is supported)") - set(CUDA_ARCH_PTX "" CACHE STRING - "Specify GPU architectures to build PTX intermediate code for") - mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) - else() - unset(CUDA_ARCH_BIN CACHE) - unset(CUDA_ARCH_PTX CACHE) - endif() - - set(CUDA_ARCH_LIST) - if(DEFINED ENV{TORCH_CUDA_ARCH_LIST}) - set(TORCH_CUDA_ARCH_LIST $ENV{TORCH_CUDA_ARCH_LIST}) - string(REGEX REPLACE "[ \t]+" ";" TORCH_CUDA_ARCH_LIST "${TORCH_CUDA_ARCH_LIST}") - list(APPEND CUDA_ARCH_LIST ${TORCH_CUDA_ARCH_LIST}) - message(STATUS "Set CUDA arch from TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST}") - else() - list(APPEND CUDA_ARCH_LIST ${CUDA_ARCH_NAME}) - message(STATUS "Set CUDA arch from CUDA_ARCH_NAME: ${CUDA_ARCH_NAME}") - endif() - list(REMOVE_DUPLICATES CUDA_ARCH_LIST) - - set(__cuda_arch_bin) - set(__cuda_arch_ptx) - foreach(arch_name ${CUDA_ARCH_LIST}) - set(arch_bin) - set(arch_ptx) - set(add_ptx FALSE) - # Check to see if we are compiling PTX - if(arch_name MATCHES "(.*)\\+PTX$") - set(add_ptx TRUE) - set(arch_name ${CMAKE_MATCH_1}) - endif() - if(arch_name MATCHES "(^[0-9]\\.[0-9](\\([0-9]\\.[0-9]\\))?)$") - set(arch_bin ${CMAKE_MATCH_1}) - set(arch_ptx ${arch_bin}) - else() - # Look for it in our list of known architectures - if(${arch_name} STREQUAL "Kepler") - set(arch_bin "30 35") - elseif(${arch_name} STREQUAL "Maxwell") - set(arch_bin "50") - elseif(${arch_name} STREQUAL "Pascal") - set(arch_bin "60 61") - elseif(${arch_name} STREQUAL "Volta") - set(arch_bin "70") - elseif(${arch_name} STREQUAL "All") - set(arch_bin ${Caffe2_known_gpu_archs}) - elseif(${arch_name} STREQUAL "Manual") - set(arch_bin ${CUDA_ARCH_BIN}) - set(arch_ptx ${CUDA_ARCH_PTX}) - set(add_ptx TRUE) - elseif(${arch_name} STREQUAL "Auto") - caffe2_detect_installed_gpus(arch_bin) - else() - message(FATAL_ERROR "Unknown CUDA architecture name ${arch_name}") - endif() - endif() - list(APPEND __cuda_arch_bin ${arch_bin}) - if(add_ptx) - if (NOT arch_ptx) - set(arch_ptx ${arch_bin}) - endif() - list(APPEND __cuda_arch_ptx ${arch_ptx}) - 
endif() - endforeach() - - # Remove dots and convert to lists - string(REGEX REPLACE "\\." "" __cuda_arch_bin "${__cuda_arch_bin}") - string(REGEX REPLACE "\\." "" __cuda_arch_ptx "${__cuda_arch_ptx}") - string(REGEX MATCHALL "[0-9()]+" __cuda_arch_bin "${__cuda_arch_bin}") - string(REGEX MATCHALL "[0-9]+" __cuda_arch_ptx "${__cuda_arch_ptx}") - list(REMOVE_DUPLICATES __cuda_arch_bin) - list(REMOVE_DUPLICATES __cuda_arch_ptx) - - set(__nvcc_flags "") - set(__nvcc_archs_readable "") - - # Tell NVCC to add binaries for the specified GPUs - foreach(__arch ${__cuda_arch_bin}) - if(__arch MATCHES "([0-9]+)\\(([0-9]+)\\)") - # User explicitly specified PTX for the concrete BIN - list(APPEND __nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) - list(APPEND __nvcc_archs_readable sm_${CMAKE_MATCH_1}) - else() - # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN - list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=sm_${__arch}) - list(APPEND __nvcc_archs_readable sm_${__arch}) - endif() - endforeach() - - # Tell NVCC to add PTX intermediate code for the specified architectures - foreach(__arch ${__cuda_arch_ptx}) - list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=compute_${__arch}) - list(APPEND __nvcc_archs_readable compute_${__arch}) - endforeach() - - string(REPLACE ";" " " __nvcc_archs_readable "${__nvcc_archs_readable}") - set(${out_variable} ${__nvcc_flags} PARENT_SCOPE) - set(${out_variable}_readable ${__nvcc_archs_readable} PARENT_SCOPE) -endfunction() - -################################################################################################ -### Non macro section -################################################################################################ - # Special care for windows platform: we know that 32-bit windows does not # support cuda. if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") @@ -467,11 +277,9 @@ if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") endif() if (${CUDA_VERSION} LESS 8.0) # CUDA 7.x - set(Caffe2_known_gpu_archs ${Caffe2_known_gpu_archs7}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x - set(Caffe2_known_gpu_archs ${Caffe2_known_gpu_archs8}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") # CUDA 8 may complain that sm_20 is no longer supported. Suppress the @@ -510,9 +318,26 @@ elseif (CUDA_VERSION VERSION_EQUAL 8.0) endif() # setting nvcc arch flags -caffe2_select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) +if ((NOT EXISTS ${TORCH_CUDA_ARCH_LIST}) AND (DEFINED ENV{TORCH_CUDA_ARCH_LIST})) + message(WARNING + "In the future we will require one to explicitly pass " + "TORCH_CUDA_ARCH_LIST to cmake instead of implicitly setting it as an " + "env variable. This will become a FATAL_ERROR in future version of " + "pytorch.") + set(TORCH_CUDA_ARCH_LIST $ENV{TORCH_CUDA_ARCH_LIST}) +endif() +if (EXISTS ${CUDA_ARCH_NAME}) + message(WARNING + "CUDA_ARCH_NAME is no longer used. Use TORCH_CUDA_ARCH_LIST instead. " + "Right now, CUDA_ARCH_NAME is ${CUDA_ARCH_NAME} and " + "TORCH_CUDA_ARCH_LIST is ${TORCH_CUDA_ARCH_LIST}.") + set(TORCH_CUDA_ARCH_LIST TORCH_CUDA_ARCH_LIST ${CUDA_ARCH_NAME}) +endif() + +# Invoke cuda_select_nvcc_arch_flags from proper cmake FindCUDA. 
+cuda_select_nvcc_arch_flags(NVCC_FLAGS_EXTRA ${TORCH_CUDA_ARCH_LIST}) list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) -message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}") +message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA}") # disable some nvcc diagnostic that apears in boost, glog, glags, opencv, etc. foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration set_but_not_used) diff --git a/conda/caffe2/normal/.gitignore b/conda/caffe2/normal/.gitignore new file mode 100644 index 00000000000000..0552636118f261 --- /dev/null +++ b/conda/caffe2/normal/.gitignore @@ -0,0 +1 @@ +meta.yaml diff --git a/conda/integrated/.gitignore b/conda/integrated/.gitignore new file mode 100644 index 00000000000000..0552636118f261 --- /dev/null +++ b/conda/integrated/.gitignore @@ -0,0 +1 @@ +meta.yaml diff --git a/docker/caffe2/ubuntu-16.04-cuda8-cudnn7-all-options/Dockerfile b/docker/caffe2/ubuntu-16.04-cuda8-cudnn7-all-options/Dockerfile index 194d7cbdf06387..9235da3102275f 100644 --- a/docker/caffe2/ubuntu-16.04-cuda8-cudnn7-all-options/Dockerfile +++ b/docker/caffe2/ubuntu-16.04-cuda8-cudnn7-all-options/Dockerfile @@ -29,7 +29,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ wget \ && rm -rf /var/lib/apt/lists/* -RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \ +RUN pip install --no-cache-dir --upgrade pip==9.0.3 setuptools wheel && \ pip install --no-cache-dir \ flask \ future \ @@ -50,8 +50,8 @@ RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \ tornado ########## INSTALLATION STEPS ################### -RUN git clone --branch master --recursive https://github.com/caffe2/caffe2.git -RUN cd caffe2 && mkdir build && cd build \ +RUN git clone --branch master --recursive https://github.com/pytorch/pytorch.git +RUN cd pytorch && mkdir build && cd build \ && cmake .. \ -DCUDA_ARCH_NAME=Manual \ -DCUDA_ARCH_BIN="35 52 60 61" \ diff --git a/docs/source/autograd.rst b/docs/source/autograd.rst index e220aa930eda1f..058c0e6ba1dfa4 100644 --- a/docs/source/autograd.rst +++ b/docs/source/autograd.rst @@ -73,6 +73,15 @@ Tensor autograd functions .. autoclass:: Function :members: +.. _grad-check: + +Numerical gradient checking +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autofunction:: gradcheck + +.. autofunction:: gradgradcheck + Profiler ^^^^^^^^ diff --git a/docs/source/notes/extending.rst b/docs/source/notes/extending.rst index 55732ae0ca6231..810ef12bcc18a4 100644 --- a/docs/source/notes/extending.rst +++ b/docs/source/notes/extending.rst @@ -98,6 +98,12 @@ non-Variable arguments:: # Gradients of non-Tensor arguments to forward must be None. return grad_output * ctx.constant, None +.. note:: + Inputs to ``backward``, i.e., :attr:`grad_output`, can also be Tensors that + track history. So if ``backward`` is implemented with differentiable + operations, (e.g., invocation of another custom + :class:`~torch.autograd.function`), higher order derivatives will work. + You probably want to check if the backward method you implemented actually computes the derivatives of your function. It is possible by comparing with numerical approximations using small finite differences:: @@ -111,6 +117,8 @@ numerical approximations using small finite differences:: test = gradcheck(linear, input, eps=1e-6, atol=1e-4) print(test) +See :ref:`grad-check` for more details on finite-difference gradient comparisons. 
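A minimal sketch of the behaviour the note above describes (the ``Square`` function here is illustrative and not part of this patch): because ``backward`` is written entirely with differentiable tensor operations, ``gradcheck`` can verify the first derivatives and ``gradgradcheck`` the second derivatives::

    import torch
    from torch.autograd import Function, gradcheck, gradgradcheck

    class Square(Function):
        @staticmethod
        def forward(ctx, x):
            ctx.save_for_backward(x)
            return x * x

        @staticmethod
        def backward(ctx, grad_output):
            x, = ctx.saved_tensors
            # grad_output may itself require grad; using only differentiable
            # ops here is what makes double backward work.
            return 2 * x * grad_output

    x = torch.randn(4, dtype=torch.float64, requires_grad=True)
    print(gradcheck(Square.apply, (x,), eps=1e-6, atol=1e-4))  # first derivatives
    print(gradgradcheck(Square.apply, (x,)))                   # second derivatives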
+ Extending :mod:`torch.nn` ------------------------- diff --git a/modules/observers/net_observer_reporter_print.cc b/modules/observers/net_observer_reporter_print.cc index 488fba3abfe1d1..51958dabdf5df4 100644 --- a/modules/observers/net_observer_reporter_print.cc +++ b/modules/observers/net_observer_reporter_print.cc @@ -1,5 +1,6 @@ #include "observers/net_observer_reporter_print.h" +#include <sstream> #include "caffe2/core/init.h" #include "observers/observer_config.h" @@ -10,11 +11,44 @@ const std::string NetObserverReporterPrint::IDENTIFIER = "Caffe2Observer "; void NetObserverReporterPrint::report( NetBase* net, std::map& info) { - LOG(INFO) << IDENTIFIER << "Net Name - " << net->Name(); - LOG(INFO) << IDENTIFIER << "Delay Start"; + std::map<std::string, std::map<std::string, std::string>> caffe2_perf; + std::map<std::string, std::string> op_perf; + std::map<std::string, std::string> net_perf; + for (auto& p : info) { - LOG(INFO) << IDENTIFIER << p.first << " - " << p.second.latency << "\t(ms)"; + if ((p.first == "NET_DELAY") && (info.size() == 1)) { + // for Net_delay perf + net_perf["latency"] = caffe2::to_string(p.second.latency); + net_perf["flops"] = caffe2::to_string(-1); + caffe2_perf["NET"] = net_perf; + } else if (p.first != "NET_DELAY") { + // for operator perf + op_perf["latency"] = caffe2::to_string(p.second.latency); + op_perf["flops"] = caffe2::to_string(p.second.flops); + caffe2_perf[p.first] = op_perf; + } + } + + std::stringstream buffer; + buffer << IDENTIFIER << "{"; + for (auto it = caffe2_perf.begin(); it != caffe2_perf.end(); it++) { + buffer << "\"" << it->first << "\"" + << ": {"; + for (auto jt = it->second.begin(); jt != it->second.end(); jt++) { + buffer << "\"" << jt->first << "\"" + << ": " << jt->second; + auto kt = jt; + if ((++kt) != it->second.end()) { + buffer << ", "; + } + } + buffer << "}"; + auto lt = it; + if ((++lt) != caffe2_perf.end()) { + buffer << ", "; + } } - LOG(INFO) << IDENTIFIER << "Delay End"; + buffer << "}"; + LOG(INFO) << buffer.str(); } } diff --git a/modules/observers/perf_observer.cc b/modules/observers/perf_observer.cc index 9320625f147ac3..f5ab3f11af1c6f 100644 --- a/modules/observers/perf_observer.cc +++ b/modules/observers/perf_observer.cc @@ -85,6 +85,11 @@ void PerfNetObserver::Stop() { p.latency = static_cast(observerMap_[op]) ->getMilliseconds(); +#ifndef CAFFE2_IOS + auto cost = static_cast(observerMap_[op]) + ->getAnalyticalCost(); + p.flops = cost.flops; +#endif // CAFFE2_MOBILE p.engine = op->engine(); p.type = op->type(); diff --git a/scripts/build_anaconda.sh b/scripts/build_anaconda.sh index 4497a330facd5a..ce65c7914d7b11 100755 --- a/scripts/build_anaconda.sh +++ b/scripts/build_anaconda.sh @@ -324,7 +324,7 @@ if [[ -n $pytorch_too ]]; then fi if [[ -z $slim ]]; then - add_package 'opencv' + add_package 'opencv' '<3.4' else caffe2_cmake_args+=("-DUSE_OPENCV=OFF") fi @@ -355,13 +355,8 @@ else fi # Change flags based on target gcc ABI +# Default conda channels use gcc 7.2, conda-forge uses gcc 4.8.5 if [[ "$(uname)" != 'Darwin' && "$GCC_USE_C11" -eq 0 ]]; then - # opencv 3.3.1 in conda-forge doesn't have imgcodecs, and opencv 3.1.0 - # requires numpy 1.12 - remove_lines_with 'opencv' - add_package 'opencv' '==3.1.0' - - # Default conda channels use gcc 7.2, conda-forge uses gcc 4.8.5 caffe2_cmake_args+=("-DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=0") conda_channel+=('-c conda-forge') fi diff --git a/scripts/build_windows.bat b/scripts/build_windows.bat index b3432d37ef495e..185d43ac0a8992 100644 --- a/scripts/build_windows.bat +++ b/scripts/build_windows.bat @@ -51,7 +51,7 @@ cmake .. 
^ -DBUILD_TEST=OFF ^ -DCMAKE_BUILD_TYPE=%CMAKE_BUILD_TYPE% ^ -DUSE_CUDA=%USE_CUDA% ^ - -DCUDA_ARCH_NAME=Maxwell ^ + -DTORCH_CUDA_ARCH_LIST=5.0 ^ -DUSE_NNPACK=OFF ^ -DUSE_CUB=OFF ^ -DUSE_GLOG=OFF ^ diff --git a/setup.py b/setup.py index 8dfeca7424d48b..50c28867b296fd 100644 --- a/setup.py +++ b/setup.py @@ -27,6 +27,9 @@ # NO_CUDNN # disables the cuDNN build # +# NO_MIOPEN +# disables the MIOpen build +# # NO_MKLDNN # disables the MKLDNN build # @@ -91,6 +94,7 @@ import distutils.unixccompiler import distutils.command.build import distutils.command.clean +import distutils.sysconfig import platform import subprocess import shutil @@ -103,8 +107,11 @@ from tools.setup_helpers.env import check_env_flag from tools.setup_helpers.cuda import WITH_CUDA, CUDA_HOME, CUDA_VERSION +from tools.setup_helpers.rocm import WITH_ROCM, ROCM_HOME, ROCM_VERSION from tools.setup_helpers.cudnn import (WITH_CUDNN, CUDNN_LIBRARY, CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR) +from tools.setup_helpers.miopen import (WITH_MIOPEN, MIOPEN_LIBRARY, + MIOPEN_LIB_DIR, MIOPEN_INCLUDE_DIR) from tools.setup_helpers.nccl import WITH_NCCL, WITH_SYSTEM_NCCL, NCCL_LIB_DIR, \ NCCL_INCLUDE_DIR, NCCL_ROOT_DIR, NCCL_SYSTEM_LIB from tools.setup_helpers.mkldnn import (WITH_MKLDNN, MKLDNN_LIBRARY, @@ -116,31 +123,58 @@ from tools.setup_helpers.dist_check import WITH_DISTRIBUTED, \ WITH_DISTRIBUTED_MW, WITH_GLOO_IBVERBS -DEBUG = check_env_flag('DEBUG') +################################################################################ +# Parameters parsed from environment +################################################################################ + +DEBUG = check_env_flag('DEBUG') IS_WINDOWS = (platform.system() == 'Windows') IS_DARWIN = (platform.system() == 'Darwin') IS_LINUX = (platform.system() == 'Linux') - -# Check if ROCM is enabled -WITH_ROCM = check_env_flag('WITH_ROCM') +FULL_CAFFE2 = check_env_flag('FULL_CAFFE2') +BUILD_PYTORCH = check_env_flag('BUILD_PYTORCH') NUM_JOBS = multiprocessing.cpu_count() max_jobs = os.getenv("MAX_JOBS") if max_jobs is not None: NUM_JOBS = min(NUM_JOBS, int(max_jobs)) +# Ninja try: import ninja WITH_NINJA = True + ninja_global = NinjaBuilder('global') except ImportError: WITH_NINJA = False + ninja_global = None -if not WITH_NINJA: - ################################################################################ - # Monkey-patch setuptools to compile in parallel - ################################################################################ +# Constant known variables used throughout this file +cwd = os.path.dirname(os.path.abspath(__file__)) +lib_path = os.path.join(cwd, "torch", "lib") +third_party_path = os.path.join(cwd, "third_party") +tmp_install_path = lib_path + "/tmp_install" + + +class PytorchCommand(setuptools.Command): + """ + Base Pytorch command to avoid implementing initialize/finalize_options in + every subclass + """ + user_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + +################################################################################ +# Patches and workarounds +################################################################################ +# Monkey-patch setuptools to compile in parallel +if not WITH_NINJA: def parallelCCompile(self, sources, output_dir=None, macros=None, include_dirs=None, debug=0, extra_preargs=None, extra_postargs=None, depends=None): @@ -160,6 +194,7 @@ def _single_compile(obj): return objects distutils.ccompiler.CCompiler.compile = parallelCCompile +# Patch for linking with ccache original_link = 
distutils.unixccompiler.UnixCCompiler.link @@ -170,35 +205,72 @@ def patched_link(self, *args, **kwargs): self.compiler_cxx = _cxx return result - distutils.unixccompiler.UnixCCompiler.link = patched_link -################################################################################ # Workaround setuptools -Wstrict-prototypes warnings # I lifted this code from https://stackoverflow.com/a/29634231/23845 -################################################################################ -import distutils.sysconfig cfg_vars = distutils.sysconfig.get_config_vars() for key, value in cfg_vars.items(): if type(value) == str: cfg_vars[key] = value.replace("-Wstrict-prototypes", "") + +################################################################################ +# Version and create_version_file +################################################################################ +version = '0.5.0a0' +if os.getenv('PYTORCH_BUILD_VERSION'): + assert os.getenv('PYTORCH_BUILD_NUMBER') is not None + build_number = int(os.getenv('PYTORCH_BUILD_NUMBER')) + version = os.getenv('PYTORCH_BUILD_VERSION') + if build_number > 1: + version += '.post' + str(build_number) +else: + try: + sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip() + version += '+' + sha[:7] + except Exception: + pass + + +class create_version_file(PytorchCommand): + def run(self): + global version, cwd + print('-- Building version ' + version) + version_path = os.path.join(cwd, 'torch', 'version.py') + with open(version_path, 'w') as f: + f.write("__version__ = '{}'\n".format(version)) + # NB: This is not 100% accurate, because you could have built the + # library code with DEBUG, but csrc without DEBUG (in which case + # this would claim to be a release build when it's not.) + f.write("debug = {}\n".format(repr(DEBUG))) + f.write("cuda = {}\n".format(repr(CUDA_VERSION))) + + ################################################################################ -# Custom build commands +# Building dependent libraries ################################################################################ +# All libraries that torch could depend on dep_libs = [ 'nccl', 'caffe2', 'libshm', 'libshm_windows', 'gloo', 'THD', 'nanopb', ] +missing_pydep = ''' +Missing build dependency: Unable to `import {importname}`. 
+Please install it via `conda install {module}` or `pip install {module}` +'''.strip() + -# global ninja file for building generated code stuff -ninja_global = None -if WITH_NINJA: - ninja_global = NinjaBuilder('global') +def check_pydep(importname, module): + try: + importlib.import_module(importname) + except ImportError: + raise RuntimeError(missing_pydep.format(importname=importname, module=module)) +# Calls build_pytorch_libs.sh/bat with the correct env variables def build_libs(libs): for lib in libs: assert lib in dep_libs, 'invalid lib: {}'.format(lib) @@ -229,44 +301,30 @@ def build_libs(libs): my_env["CUDNN_LIB_DIR"] = CUDNN_LIB_DIR my_env["CUDNN_LIBRARY"] = CUDNN_LIBRARY my_env["CUDNN_INCLUDE_DIR"] = CUDNN_INCLUDE_DIR + if WITH_MIOPEN: + my_env["MIOPEN_LIB_DIR"] = MIOPEN_LIB_DIR + my_env["MIOPEN_LIBRARY"] = MIOPEN_LIBRARY + my_env["MIOPEN_INCLUDE_DIR"] = MIOPEN_INCLUDE_DIR if WITH_MKLDNN: my_env["MKLDNN_LIB_DIR"] = MKLDNN_LIB_DIR my_env["MKLDNN_LIBRARY"] = MKLDNN_LIBRARY my_env["MKLDNN_INCLUDE_DIR"] = MKLDNN_INCLUDE_DIR build_libs_cmd += ['--with-mkldnn'] - if WITH_GLOO_IBVERBS: build_libs_cmd += ['--with-gloo-ibverbs'] - if WITH_DISTRIBUTED_MW: build_libs_cmd += ['--with-distributed-mw'] + if FULL_CAFFE2: + build_libs_cmd += ['--full-caffe2'] + if subprocess.call(build_libs_cmd + libs, env=my_env) != 0: print("Failed to run '{}'".format(' '.join(build_libs_cmd + libs))) sys.exit(1) -missing_pydep = ''' -Missing build dependency: Unable to `import {importname}`. -Please install it via `conda install {module}` or `pip install {module}` -'''.strip() - - -def check_pydep(importname, module): - try: - importlib.import_module(importname) - except ImportError: - raise RuntimeError(missing_pydep.format(importname=importname, module=module)) - - -class build_deps(Command): - user_options = [] - - def initialize_options(self): - pass - - def finalize_options(self): - pass +# Build all dependent libraries +class build_deps(PytorchCommand): def run(self): # Check if you remembered to check out submodules def check_file(f): @@ -334,15 +392,7 @@ def run(self): build_dep_cmds['build_' + lib.lower()] = build_dep -class build_module(Command): - user_options = [] - - def initialize_options(self): - pass - - def finalize_options(self): - pass - +class build_module(PytorchCommand): def run(self): self.run_command('build_py') self.run_command('build_ext') @@ -351,27 +401,14 @@ def run(self): class build_py(setuptools.command.build_py.build_py): def run(self): - self.create_version_file() + self.run_command('create_version_file') setuptools.command.build_py.build_py.run(self) - @staticmethod - def create_version_file(): - global version, cwd - print('-- Building version ' + version) - version_path = os.path.join(cwd, 'torch', 'version.py') - with open(version_path, 'w') as f: - f.write("__version__ = '{}'\n".format(version)) - # NB: This is not 100% accurate, because you could have built the - # library code with DEBUG, but csrc without DEBUG (in which case - # this would claim to be a release build when it's not.) 
- f.write("debug = {}\n".format(repr(DEBUG))) - f.write("cuda = {}\n".format(repr(CUDA_VERSION))) - class develop(setuptools.command.develop.develop): def run(self): - build_py.create_version_file() + self.run_command('create_version_file') setuptools.command.develop.develop.run(self) self.create_compile_commands() @@ -426,6 +463,8 @@ def run(self): print('-- Detected cuDNN at ' + CUDNN_LIBRARY + ', ' + CUDNN_INCLUDE_DIR) else: print('-- Not using cuDNN') + if WITH_MIOPEN: + print('-- Detected MIOpen at ' + MIOPEN_LIBRARY + ', ' + MIOPEN_INCLUDE_DIR) if WITH_CUDA: print('-- Detected CUDA at ' + CUDA_HOME) else: @@ -474,6 +513,39 @@ def run(self): self.copy_file(export_lib, target_lib) + def build_extensions(self): + # If also building Caffe2 python, then we need this extra logic copied + # from the old setup_caffe2.py + if FULL_CAFFE2: + pybind_exts = [ + 'caffe2.python.caffe2_pybind11_state', + 'caffe2.python.caffe2_pybind11_state_gpu', + ] + # The cmake of Caffe2 puts these in the site-packages rather than + # the build directory like the other torch extensions + sp_dir = distutils.sysconfig.get_python_lib(prefix='') + i = 0 + while i < len(self.extensions): + ext = self.extensions[i] + if ext.name not in pybind_exts: + i += 1 + continue + fullname = self.get_ext_fullname(ext.name) + filename = self.get_ext_filename(fullname) + + src = os.path.join(tmp_install_path, sp_dir, filename) + if not os.path.exists(src): + print("{} does not exist".format(src)) + del self.extensions[i] + else: + dst = os.path.join(os.path.realpath(self.build_lib), filename) + dst_dir = os.path.dirname(dst) + if not os.path.exists(dst_dir): + os.makedirs(dst_dir) + self.copy_file(src, dst) + i += 1 + distutils.command.build_ext.build_ext.build_extensions(self) + class build(distutils.command.build.build): sub_commands = [ @@ -553,12 +625,6 @@ def run(self): if check_env_flag('WERROR'): extra_compile_args.append('-Werror') -cwd = os.path.dirname(os.path.abspath(__file__)) -lib_path = os.path.join(cwd, "torch", "lib") -third_party_path = os.path.join(cwd, "third_party") - - -tmp_install_path = lib_path + "/tmp_install" include_dirs += [ cwd, os.path.join(cwd, "torch", "csrc"), @@ -573,8 +639,10 @@ def run(self): # we specify exact lib names to avoid conflict with lua-torch installs CAFFE2_LIBS = [os.path.join(lib_path, 'libcaffe2.so')] -if WITH_CUDA or WITH_ROCM: +if WITH_CUDA: CAFFE2_LIBS.extend(['-Wl,--no-as-needed', os.path.join(lib_path, 'libcaffe2_gpu.so'), '-Wl,--as-needed']) +if WITH_ROCM: + CAFFE2_LIBS.extend(['-Wl,--no-as-needed', os.path.join(lib_path, 'libcaffe2_hip.so'), '-Wl,--as-needed']) THD_LIB = os.path.join(lib_path, 'libTHD.a') NCCL_LIB = os.path.join(lib_path, 'libnccl.so.1') @@ -583,14 +651,18 @@ def run(self): if IS_DARWIN: CAFFE2_LIBS = [os.path.join(lib_path, 'libcaffe2.dylib')] - if WITH_CUDA or WITH_ROCM: + if WITH_CUDA: CAFFE2_LIBS.append(os.path.join(lib_path, 'libcaffe2_gpu.dylib')) + if WITH_ROCM: + CAFFE2_LIBS.append(os.path.join(lib_path, 'libcaffe2_hip.dylib')) NCCL_LIB = os.path.join(lib_path, 'libnccl.1.dylib') if IS_WINDOWS: CAFFE2_LIBS = [os.path.join(lib_path, 'caffe2.lib')] - if WITH_CUDA or WITH_ROCM: + if WITH_CUDA: CAFFE2_LIBS.append(os.path.join(lib_path, 'caffe2_gpu.lib')) + if WITH_ROCM: + CAFFE2_LIBS.append(os.path.join(lib_path, 'caffe2_hip.lib')) if DEBUG: NANOPB_STATIC_LIB = os.path.join(lib_path, 'protobuf-nanopbd.lib') else: @@ -664,6 +736,7 @@ def run(self): "torch/csrc/jit/passes/canonicalize.cpp", "torch/csrc/jit/passes/batch_mm.cpp", 
"torch/csrc/jit/passes/decompose_addmm.cpp", + "torch/csrc/jit/passes/loop_unrolling.cpp", "torch/csrc/jit/passes/onnx/peephole.cpp", "torch/csrc/jit/passes/onnx/fixup_onnx_loop.cpp", "torch/csrc/jit/generated/aten_dispatch.cpp", @@ -789,7 +862,6 @@ def run(self): extra_link_args.append('-Wl,-rpath,' + hip_lib_path) extra_compile_args += ['-DWITH_ROCM'] extra_compile_args += ['-D__HIP_PLATFORM_HCC__'] - main_sources += [ "torch/csrc/cuda/Module.cpp", "torch/csrc/cuda/Storage.cpp", @@ -819,6 +891,12 @@ def run(self): if not IS_WINDOWS: extra_link_args.insert(0, '-Wl,-rpath,' + CUDNN_LIB_DIR) extra_compile_args += ['-DWITH_CUDNN'] +if WITH_MIOPEN: + # main_libraries += [MIOPEN_LIBRARY] + include_dirs.insert(0, MIOPEN_INCLUDE_DIR) + if not IS_WINDOWS: + extra_link_args.insert(0, '-Wl,-rpath,' + MIOPEN_LIB_DIR) + extra_compile_args += ['-DWITH_MIOPEN'] if DEBUG: if IS_WINDOWS: @@ -884,21 +962,22 @@ def make_relative_rpath(path): ) extensions.append(THNVRTC) -version = '0.5.0a0' -if os.getenv('PYTORCH_BUILD_VERSION'): - assert os.getenv('PYTORCH_BUILD_NUMBER') is not None - build_number = int(os.getenv('PYTORCH_BUILD_NUMBER')) - version = os.getenv('PYTORCH_BUILD_VERSION') - if build_number > 1: - version += '.post' + str(build_number) -else: - try: - sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip() - version += '+' + sha[:7] - except Exception: - pass +if FULL_CAFFE2: + # If building Caffe2 python as well, these extensions are built by cmake + # copied manually in build_extensions() inside the build_ext implementaiton + extensions.append( + setuptools.Extension( + name=str('caffe2.python.caffe2_pybind11_state'), + sources=[]), + ) + extensions.append( + setuptools.Extension( + name=str('caffe2.python.caffe2_pybind11_state_gpu'), + sources=[]), + ) cmdclass = { + 'create_version_file': create_version_file, 'build': build, 'build_py': build_py, 'build_ext': build_ext, diff --git a/setup_caffe2.py b/setup_caffe2.py index d5a0e1db20f0cc..77a7b64f4a3302 100644 --- a/setup_caffe2.py +++ b/setup_caffe2.py @@ -261,4 +261,10 @@ def run(self): author='jiayq', author_email='jiayq@fb.com', url='https://caffe2.ai', + entry_points={ + 'console_scripts': [ + 'convert-caffe2-to-onnx = caffe2.python.onnx.bin.conversion:caffe2_to_onnx', + 'convert-onnx-to-caffe2 = caffe2.python.onnx.bin.conversion:onnx_to_caffe2', + ] + }, ) diff --git a/test/common.py b/test/common.py index 52033b3a8858bb..94da44ea1b664c 100644 --- a/test/common.py +++ b/test/common.py @@ -187,7 +187,8 @@ def __init__(self, method_name='runTest'): # Wraps the tested method if we should do CUDA memory check. test_method = getattr(self, method_name) self._do_cuda_memory_leak_check &= getattr(test_method, '_do_cuda_memory_leak_check', True) - if self._do_cuda_memory_leak_check: + # FIXME: figure out the flaky -1024 anti-leaks on windows. See #8044 + if self._do_cuda_memory_leak_check and not IS_WINDOWS: # the import below may initialize CUDA context, so we do it only if # self._do_cuda_memory_leak_check is True. 
from common_cuda import TEST_CUDA diff --git a/test/cpp/api/misc.cpp b/test/cpp/api/misc.cpp index f8f43da1c8ddad..0f4fa33ddf3e1c 100644 --- a/test/cpp/api/misc.cpp +++ b/test/cpp/api/misc.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -12,6 +13,9 @@ using namespace torch; using namespace torch::nn; +template +using OrderedDict = detail::OrderedDict; + using Catch::StartsWith; TEST_CASE("misc") { @@ -67,7 +71,7 @@ TEST_CASE("autograd") { } SECTION("custom gradient inputs") { z.sum().backward( - autograd::make_variable(at::ones(at::CPU(at::kFloat), {1}) * 2)); + autograd::make_variable(at::ones(at::CPU(at::kFloat), {}) * 2)); REQUIRE(x.grad().allclose(y * 2)); } // Assume everything else is safe from PyTorch tests. @@ -153,3 +157,193 @@ TEST_CASE("make_unique") { REQUIRE(ptr[2] == 0); } } + +TEST_CASE("ordered-dict") { + SECTION("is empty after default construction") { + OrderedDict dict; + REQUIRE(dict.subject() == "Key"); + REQUIRE(dict.is_empty()); + REQUIRE(dict.size() == 0); + } + + SECTION("insert inserts elements when they are not yet present") { + OrderedDict dict; + dict.insert("a", 1); + dict.insert("b", 2); + REQUIRE(dict.size() == 2); + } + + SECTION("get returns values when present") { + OrderedDict dict; + dict.insert("a", 1); + dict.insert("b", 2); + REQUIRE(dict.get("a") == 1); + REQUIRE(dict.get("b") == 2); + } + + SECTION("get throws when passed keys that are not present") { + OrderedDict dict; + dict.insert("a", 1); + dict.insert("b", 2); + REQUIRE_THROWS_WITH( + dict.get("foo"), StartsWith("Key 'foo' is not defined")); + REQUIRE_THROWS_WITH(dict.get(""), StartsWith("Key '' is not defined")); + } + + SECTION("can initialize from list") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE(dict.size() == 2); + REQUIRE(dict.get("a") == 1); + REQUIRE(dict.get("b") == 2); + } + + SECTION("insert throws when passed elements that are present") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE_THROWS_WITH( + dict.insert("a", 1), StartsWith("Key 'a' already defined")); + REQUIRE_THROWS_WITH( + dict.insert("b", 1), StartsWith("Key 'b' already defined")); + } + + SECTION("front() returns the first item") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE(dict.front().key == "a"); + REQUIRE(dict.front().value == 1); + } + + SECTION("back() returns the last item") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE(dict.back().key == "b"); + REQUIRE(dict.back().value == 2); + } + + SECTION("find returns pointers to values when present") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE(dict.find("a") != nullptr); + REQUIRE(*dict.find("a") == 1); + REQUIRE(dict.find("b") != nullptr); + REQUIRE(*dict.find("b") == 2); + } + + SECTION("find returns null pointers when passed keys that are not present") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE(dict.find("bar") == nullptr); + REQUIRE(dict.find("") == nullptr); + } + + SECTION("operator[] returns values when passed keys that are present") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE(dict["a"] == 1); + REQUIRE(dict["b"] == 2); + } + + SECTION("operator[] returns items positionally when passed integers") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE(dict[0].key == "a"); + REQUIRE(dict[0].value == 1); + REQUIRE(dict[1].key == "b"); + REQUIRE(dict[1].value == 2); + } + + SECTION("operator[] throws when passed keys that are not present") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE_THROWS_WITH( + dict.get("foo"), StartsWith("Key 'foo' is not defined")); + 
REQUIRE_THROWS_WITH(dict.get(""), StartsWith("Key '' is not defined")); + } + + SECTION("update inserts all items from another OrderedDict") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + OrderedDict dict2 = {{"c", 3}}; + dict2.update(dict); + REQUIRE(dict2.size() == 3); + REQUIRE(dict2.find("a") != nullptr); + REQUIRE(dict2.find("b") != nullptr); + REQUIRE(dict2.find("c") != nullptr); + } + + SECTION("update also checks for duplicates") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + OrderedDict dict2 = {{"a", 1}}; + REQUIRE_THROWS_WITH( + dict2.update(dict), StartsWith("Key 'a' already defined")); + } + + SECTION("Can iterate items") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + auto iterator = dict.begin(); + REQUIRE(iterator != dict.end()); + REQUIRE(iterator->key == "a"); + REQUIRE(iterator->value == 1); + ++iterator; + REQUIRE(iterator != dict.end()); + REQUIRE(iterator->key == "b"); + REQUIRE(iterator->value == 2); + ++iterator; + REQUIRE(iterator == dict.end()); + } + + SECTION("clear makes the dict empty") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + REQUIRE(!dict.is_empty()); + dict.clear(); + REQUIRE(dict.is_empty()); + } + + SECTION("can copy construct") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + OrderedDict copy = dict; + REQUIRE(copy.size() == 2); + REQUIRE(*copy[0] == 1); + REQUIRE(*copy[1] == 2); + } + + SECTION("can copy assign") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + OrderedDict copy = {{"c", 1}}; + REQUIRE(copy.find("c") != nullptr); + copy = dict; + REQUIRE(copy.size() == 2); + REQUIRE(*copy[0] == 1); + REQUIRE(*copy[1] == 2); + REQUIRE(copy.find("c") == nullptr); + } + + SECTION("can move construct") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + OrderedDict copy = std::move(dict); + REQUIRE(copy.size() == 2); + REQUIRE(*copy[0] == 1); + REQUIRE(*copy[1] == 2); + } + + SECTION("can move assign") { + OrderedDict dict = {{"a", 1}, {"b", 2}}; + OrderedDict copy = {{"c", 1}}; + REQUIRE(copy.find("c") != nullptr); + copy = std::move(dict); + REQUIRE(copy.size() == 2); + REQUIRE(*copy[0] == 1); + REQUIRE(*copy[1] == 2); + REQUIRE(copy.find("c") == nullptr); + } + + SECTION("can insert with braces") { + OrderedDict> dict; + dict.insert("a", {1, 2}); + REQUIRE(!dict.is_empty()); + REQUIRE(dict["a"].first == 1); + REQUIRE(dict["a"].second == 2); + } + + SECTION("Error messages include the what") { + OrderedDict dict("Penguin"); + REQUIRE(dict.subject() == "Penguin"); + dict.insert("a", 1); + REQUIRE(!dict.is_empty()); + REQUIRE_THROWS_WITH( + dict.get("b"), StartsWith("Penguin 'b' is not defined")); + REQUIRE_THROWS_WITH( + dict.insert("a", 1), StartsWith("Penguin 'a' already defined")); + } +} diff --git a/test/expect/TestScript.test_cat_lifts.expect b/test/expect/TestScript.test_cat_lifts.expect new file mode 100644 index 00000000000000..5bcef43f7c7a3d --- /dev/null +++ b/test/expect/TestScript.test_cat_lifts.expect @@ -0,0 +1,12 @@ +graph(%x : Dynamic) { + %1 : Dynamic = aten::cat[dim=1](%x, %x) + return (%1); +} +graph(%x : Dynamic) { + %1 : Dynamic = aten::cat[dim=1]() + return (%1); +} +graph(%x : Dynamic) { + %1 : Dynamic = aten::cat[dim=1](%x) + return (%1); +} diff --git a/test/expect/TestScript.test_loop_unroll_unused_counter.expect b/test/expect/TestScript.test_loop_unroll_unused_counter.expect new file mode 100644 index 00000000000000..6db04ad55ec313 --- /dev/null +++ b/test/expect/TestScript.test_loop_unroll_unused_counter.expect @@ -0,0 +1,36 @@ +graph(%x : Dynamic) { + %y.1 : Long() = prim::Constant[value={0}]() + %2 : Byte() = 
prim::Constant[value={1}]() + %3 : Dynamic = aten::div[other={8}](%x) + %4 : Dynamic = aten::mul[other={8}](%3) + %5 : Dynamic = aten::sub[alpha={1}](%x, %4) + %y.3 : Long() = prim::Loop(%3, %2, %y.1) + block0(%i.1 : Dynamic, %8 : Long()) { + %9 : Long() = prim::Constant[value={1}]() + %y.12 : Dynamic = aten::add[alpha={1}](%8, %9) + %11 : Long() = prim::Constant[value={1}]() + %y.5 : Dynamic = aten::add[alpha={1}](%y.12, %11) + %13 : Long() = prim::Constant[value={1}]() + %y.6 : Dynamic = aten::add[alpha={1}](%y.5, %13) + %15 : Long() = prim::Constant[value={1}]() + %y.7 : Dynamic = aten::add[alpha={1}](%y.6, %15) + %17 : Long() = prim::Constant[value={1}]() + %y.8 : Dynamic = aten::add[alpha={1}](%y.7, %17) + %19 : Long() = prim::Constant[value={1}]() + %y.9 : Dynamic = aten::add[alpha={1}](%y.8, %19) + %21 : Long() = prim::Constant[value={1}]() + %y.10 : Dynamic = aten::add[alpha={1}](%y.9, %21) + %23 : Long() = prim::Constant[value={1}]() + %y.11 : Dynamic = aten::add[alpha={1}](%y.10, %23) + %25 : Byte() = prim::Constant[value={1}]() + -> (%25, %y.11) + } + %y : Long() = prim::Loop(%5, %2, %y.3) + block0(%i : Dynamic, %28 : Long()) { + %29 : Long() = prim::Constant[value={1}]() + %y.4 : Dynamic = aten::add[alpha={1}](%28, %29) + %31 : Byte() = prim::Constant[value={1}]() + -> (%31, %y.4) + } + return (%y); +} diff --git a/test/expect/TestScript.test_loop_unrolling.expect b/test/expect/TestScript.test_loop_unrolling.expect new file mode 100644 index 00000000000000..0c61346c786fbc --- /dev/null +++ b/test/expect/TestScript.test_loop_unrolling.expect @@ -0,0 +1,37 @@ +graph(%x : Dynamic) { + %y.1 : Long() = prim::Constant[value={0}]() + %2 : Byte() = prim::Constant[value={1}]() + %3 : Long() = prim::Constant[value={0}]() + %4 : Dynamic = aten::div[other={8}](%x) + %5 : Dynamic = aten::mul[other={8}](%4) + %6 : Dynamic = aten::sub[alpha={1}](%x, %5) + %7 : Dynamic, %y.3 : Long() = prim::Loop(%4, %2, %3, %y.1) + block0(%i.1 : Dynamic, %10 : Dynamic, %11 : Long()) { + %y.12 : Dynamic = aten::add[alpha={1}](%11, %10) + %13 : Dynamic = aten::add[alpha={1}, other={1}](%10) + %y.5 : Dynamic = aten::add[alpha={1}](%y.12, %13) + %15 : Dynamic = aten::add[alpha={1}, other={1}](%13) + %y.6 : Dynamic = aten::add[alpha={1}](%y.5, %15) + %17 : Dynamic = aten::add[alpha={1}, other={1}](%15) + %y.7 : Dynamic = aten::add[alpha={1}](%y.6, %17) + %19 : Dynamic = aten::add[alpha={1}, other={1}](%17) + %y.8 : Dynamic = aten::add[alpha={1}](%y.7, %19) + %21 : Dynamic = aten::add[alpha={1}, other={1}](%19) + %y.9 : Dynamic = aten::add[alpha={1}](%y.8, %21) + %23 : Dynamic = aten::add[alpha={1}, other={1}](%21) + %y.10 : Dynamic = aten::add[alpha={1}](%y.9, %23) + %25 : Dynamic = aten::add[alpha={1}, other={1}](%23) + %y.11 : Dynamic = aten::add[alpha={1}](%y.10, %25) + %27 : Byte() = prim::Constant[value={1}]() + %28 : Dynamic = aten::add[alpha={1}, other={1}](%25) + -> (%27, %28, %y.11) + } + %29 : Dynamic, %y : Long() = prim::Loop(%6, %2, %7, %y.3) + block0(%i : Dynamic, %32 : Dynamic, %33 : Long()) { + %y.4 : Dynamic = aten::add[alpha={1}](%33, %32) + %35 : Byte() = prim::Constant[value={1}]() + %36 : Dynamic = aten::add[alpha={1}, other={1}](%32) + -> (%35, %36, %y.4) + } + return (%y); +} diff --git a/test/expect/TestScript.test_loop_unrolling_const-add_const.expect b/test/expect/TestScript.test_loop_unrolling_const-add_const.expect new file mode 100644 index 00000000000000..28eb86abd3f0bd --- /dev/null +++ b/test/expect/TestScript.test_loop_unrolling_const-add_const.expect @@ -0,0 +1,24 @@ +graph() { + 
%y.1 : Long() = prim::Constant[value={0}]() + %1 : Long() = prim::Constant[value={1}]() + %y.11 : Dynamic = aten::add[alpha={1}](%y.1, %1) + %3 : Long() = prim::Constant[value={1}]() + %y.2 : Dynamic = aten::add[alpha={1}](%y.11, %3) + %5 : Long() = prim::Constant[value={1}]() + %y.3 : Dynamic = aten::add[alpha={1}](%y.2, %5) + %7 : Long() = prim::Constant[value={1}]() + %y.4 : Dynamic = aten::add[alpha={1}](%y.3, %7) + %9 : Long() = prim::Constant[value={1}]() + %y.5 : Dynamic = aten::add[alpha={1}](%y.4, %9) + %11 : Long() = prim::Constant[value={1}]() + %y.6 : Dynamic = aten::add[alpha={1}](%y.5, %11) + %13 : Long() = prim::Constant[value={1}]() + %y.7 : Dynamic = aten::add[alpha={1}](%y.6, %13) + %15 : Long() = prim::Constant[value={1}]() + %y.8 : Dynamic = aten::add[alpha={1}](%y.7, %15) + %17 : Long() = prim::Constant[value={1}]() + %y.9 : Dynamic = aten::add[alpha={1}](%y.8, %17) + %19 : Long() = prim::Constant[value={1}]() + %y.10 : Dynamic = aten::add[alpha={1}](%y.9, %19) + return (%y.10); +} diff --git a/test/expect/TestScript.test_loop_unrolling_const-add_iter.expect b/test/expect/TestScript.test_loop_unrolling_const-add_iter.expect new file mode 100644 index 00000000000000..99e0996ffcf995 --- /dev/null +++ b/test/expect/TestScript.test_loop_unrolling_const-add_iter.expect @@ -0,0 +1,24 @@ +graph() { + %y.1 : Long() = prim::Constant[value={0}]() + %1 : Long() = prim::Constant[value={0}]() + %y.11 : Dynamic = aten::add[alpha={1}](%y.1, %1) + %3 : Dynamic = aten::add[alpha={1}, other={1}](%1) + %y.2 : Dynamic = aten::add[alpha={1}](%y.11, %3) + %5 : Dynamic = aten::add[alpha={1}, other={1}](%3) + %y.3 : Dynamic = aten::add[alpha={1}](%y.2, %5) + %7 : Dynamic = aten::add[alpha={1}, other={1}](%5) + %y.4 : Dynamic = aten::add[alpha={1}](%y.3, %7) + %9 : Dynamic = aten::add[alpha={1}, other={1}](%7) + %y.5 : Dynamic = aten::add[alpha={1}](%y.4, %9) + %11 : Dynamic = aten::add[alpha={1}, other={1}](%9) + %y.6 : Dynamic = aten::add[alpha={1}](%y.5, %11) + %13 : Dynamic = aten::add[alpha={1}, other={1}](%11) + %y.7 : Dynamic = aten::add[alpha={1}](%y.6, %13) + %15 : Dynamic = aten::add[alpha={1}, other={1}](%13) + %y.8 : Dynamic = aten::add[alpha={1}](%y.7, %15) + %17 : Dynamic = aten::add[alpha={1}, other={1}](%15) + %y.9 : Dynamic = aten::add[alpha={1}](%y.8, %17) + %19 : Dynamic = aten::add[alpha={1}, other={1}](%17) + %y.10 : Dynamic = aten::add[alpha={1}](%y.9, %19) + return (%y.10); +} diff --git a/test/expect/TestScript.test_loop_unrolling_nested.expect b/test/expect/TestScript.test_loop_unrolling_nested.expect new file mode 100644 index 00000000000000..97eb4245c4f86d --- /dev/null +++ b/test/expect/TestScript.test_loop_unrolling_nested.expect @@ -0,0 +1,44 @@ +graph(%x : Dynamic) { + %y.1 : Long() = prim::Constant[value={0}]() + %2 : Long() = prim::Constant[value={10}]() + %3 : Byte() = prim::Constant[value={1}]() + %y : Long() = prim::Loop(%2, %3, %y.1) + block0(%i : Dynamic, %6 : Long()) { + %7 : Byte() = prim::Constant[value={1}]() + %8 : Long() = prim::Constant[value={0}]() + %9 : Dynamic = aten::div[other={8}](%x) + %10 : Dynamic = aten::mul[other={8}](%9) + %11 : Dynamic = aten::sub[alpha={1}](%x, %10) + %12 : Dynamic, %y.4 : Long() = prim::Loop(%9, %7, %8, %6) + block0(%j.1 : Dynamic, %15 : Dynamic, %16 : Long()) { + %y.13 : Dynamic = aten::add[alpha={1}](%16, %15) + %18 : Dynamic = aten::add[alpha={1}, other={1}](%15) + %y.6 : Dynamic = aten::add[alpha={1}](%y.13, %18) + %20 : Dynamic = aten::add[alpha={1}, other={1}](%18) + %y.7 : Dynamic = aten::add[alpha={1}](%y.6, 
%20) + %22 : Dynamic = aten::add[alpha={1}, other={1}](%20) + %y.8 : Dynamic = aten::add[alpha={1}](%y.7, %22) + %24 : Dynamic = aten::add[alpha={1}, other={1}](%22) + %y.9 : Dynamic = aten::add[alpha={1}](%y.8, %24) + %26 : Dynamic = aten::add[alpha={1}, other={1}](%24) + %y.10 : Dynamic = aten::add[alpha={1}](%y.9, %26) + %28 : Dynamic = aten::add[alpha={1}, other={1}](%26) + %y.11 : Dynamic = aten::add[alpha={1}](%y.10, %28) + %30 : Dynamic = aten::add[alpha={1}, other={1}](%28) + %y.12 : Dynamic = aten::add[alpha={1}](%y.11, %30) + %32 : Byte() = prim::Constant[value={1}]() + %33 : Dynamic = aten::add[alpha={1}, other={1}](%30) + -> (%32, %33, %y.12) + } + %34 : Dynamic, %y.3 : Long() = prim::Loop(%11, %7, %12, %y.4) + block0(%j : Dynamic, %37 : Dynamic, %38 : Long()) { + %y.5 : Dynamic = aten::add[alpha={1}](%38, %37) + %40 : Byte() = prim::Constant[value={1}]() + %41 : Dynamic = aten::add[alpha={1}, other={1}](%37) + -> (%40, %41, %y.5) + } + %42 : Byte() = prim::Constant[value={1}]() + -> (%42, %y.3) + } + return (%y); +} diff --git a/test/onnx/expect/TestOperators.test_flatten.expect b/test/onnx/expect/TestOperators.test_flatten.expect index a55811b061a9a1..355af689a3db80 100644 --- a/test/onnx/expect/TestOperators.test_flatten.expect +++ b/test/onnx/expect/TestOperators.test_flatten.expect @@ -134,7 +134,7 @@ graph { op_type: "Cast" attribute { name: "to" - i: 11 + i: 7 type: INT } } diff --git a/test/test_autograd.py b/test/test_autograd.py index 0c19e24208998e..6211de479b40be 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -75,7 +75,7 @@ def _function_test(self, cls): x = torch.randn(5, 5, requires_grad=True) y = torch.randn(5, 5, requires_grad=True) result = cls.apply(x, 2, y) - go = torch.ones(1, requires_grad=True) + go = torch.ones((), requires_grad=True) result.sum().backward(go, create_graph=True) self.assertEqual(x.grad.data, y.data + torch.ones(5, 5)) @@ -173,6 +173,23 @@ def backward(self, grad_output): MyFunction()(y).sum().backward() self.assertEqual(v.grad.data, torch.zeros(shape)) + def test_invalid_gradients(self): + class MyFunction(Function): + @staticmethod + def forward(ctx, x): + return x * 2 + + @staticmethod + def backward(ctx, grad_output): + return torch.randn(10, dtype=torch.float) + + with self.assertRaisesRegex(RuntimeError, 'expected shape'): + input = torch.randn(5, 5, dtype=torch.float, requires_grad=True) + MyFunction.apply(input).sum().backward() + with self.assertRaisesRegex(RuntimeError, 'expected type'): + input = torch.randn(10, dtype=torch.double, requires_grad=True) + MyFunction.apply(input).sum().backward() + def test_accumulate_grad(self): grad_output = torch.ones(5, 5) @@ -495,7 +512,6 @@ def test_backward(self): def test_sparse_backward(self): class FixedGradientFunction(Function): - def __init__(self, grad): self.grad = grad @@ -524,15 +540,15 @@ def backward(self, grad_x): dense_fn = FixedGradientFunction(dense_grad) # sparse first - x = torch.randn(5, 5, requires_grad=True) + x = torch.randn(size, requires_grad=True) (sparse_fn1(x) + dense_fn(x) + sparse_fn2(x)).sum().backward() self.assertEqual(x.grad, dense_grad + sparse_grad1 + sparse_grad2) # dense first - x = torch.randn(5, 5, requires_grad=True) + x = torch.randn(size, requires_grad=True) (dense_fn(x) + sparse_fn1(x) + sparse_fn2(x)).sum().backward() self.assertEqual(x.grad, dense_grad + sparse_grad1 + sparse_grad2) # sparse only - x = torch.randn(5, 5, requires_grad=True) + x = torch.randn(size, requires_grad=True) (sparse_fn1(x) + 
sparse_fn2(x)).sum().backward() self.assertEqual(x.grad, sparse_grad1 + sparse_grad2) @@ -2277,6 +2293,19 @@ def test_set_requires_grad_only_for_floats_cuda(self): def test_set_requires_grad_only_for_floats(self): self._test_set_requires_grad_only_for_floats(self, False) + @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable") + def test_rnn_backward_to_input_but_not_parameters_cuda(self): + # this checks whether it is possible to not require + # weight parameters, but require inputs, see #7722 + dev = torch.device('cuda') + l = torch.nn.LSTM(2, 3).to(dev) + for p in l.parameters(): + p.requires_grad = False + s = torch.randn(1, 1, 2, requires_grad=True, device=dev) + out, _ = l(s) + out.sum().backward() + self.assertFalse(s.grad is None or s.grad.abs().sum().item() == 0) + def index_variable(shape, max_indices): if not isinstance(shape, tuple): diff --git a/test/test_cuda.py b/test/test_cuda.py index 0139c746f4f50b..d9bf6e66a41178 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -3,16 +3,23 @@ import tempfile import re import unittest +import sys from itertools import repeat import torch import torch.cuda import torch.cuda.comm as comm +from torch import multiprocessing as mp from test_torch import TestTorch from common import TestCase, get_gpu_type, to_gpu, freeze_rng_state, run_tests, PY3 -from common_cuda import TEST_CUDA, TEST_MULTIGPU +# We cannot import TEST_CUDA and TEST_MULTIGPU from common_cuda here, +# because if we do that, the TEST_CUDNN line from common_cuda will be executed +# multiple times as well during the execution of this test suite, and it will +# cause CUDA OOM error on Windows. +TEST_CUDA = torch.cuda.is_available() +TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2 if not TEST_CUDA: print('CUDA not available, skipping tests') @@ -85,6 +92,10 @@ def number(floating, integer, t): else: return integer + +def cast_tensor(tensor, t): + return t(tensor.size()).copy_(tensor) + S = 10 M = 50 @@ -401,6 +412,10 @@ def tmp(t): ('rsqrt', lambda t: constant_tensor_add(1, small_3d(t)), lambda t: [], None, float_types), ('sinh', lambda t: tensor_clamp(small_3d(t), -1, 1), lambda t: [], None, float_types), ('tan', lambda t: tensor_clamp(small_3d(t), -1, 1), lambda t: [], None, float_types), + ('__lshift__', lambda t: torch.pow(2, cast_tensor(torch.arange(1, 5), t)), + lambda t: [2], None, signed_types), + ('__rshift__', lambda t: torch.pow(2, cast_tensor(torch.arange(3, 7), t)), + lambda t: [2], None, signed_types), # lapack tests ('qr', small_2d_lapack, lambda t: [], 'square', float_types, False, unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), @@ -480,6 +495,8 @@ def tmp(t): 'tanh': 1e-3, 'trace': 1e-3, 'var': 1e-3, + '__lshift__': 1e-3, + '__rshift__': 1e-3, } simple_pointwise = [ @@ -1389,6 +1406,34 @@ def test_multinomial(self): sample = torch.multinomial(freqs, 1000, True) self.assertNotEqual(freqs[sample].min(), 0) + def _spawn_method(self, method, arg): + try: + mp.set_start_method('spawn') + except RuntimeError: + pass + with mp.Pool(1) as pool: + self.assertTrue(pool.map(method, [arg])) + + @staticmethod + def _test_multinomial_invalid_probs_cuda(probs): + try: + with torch.random.fork_rng(devices=[0]): + torch.multinomial(probs.to('cuda'), 1) + torch.cuda.synchronize() + return False # Should not be reached + except RuntimeError as e: + return 'device-side assert triggered' in str(e) + + @unittest.skipIf(not PY3, + "spawn start method is not supported in Python 2, \ + but we need it for creating another process with CUDA") 
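    # (A fresh spawned worker is used for each invalid-probs case below:
    #  after a device-side assert fires, the parent process's CUDA context
    #  would be left unusable, so the check has to run in a separate process
    #  and report back through the pool. This mirrors the CPU-side variant
    #  added to test_torch.py in this same change.)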
+ def test_multinomial_invalid_probs_cuda(self): + test_method = TestCuda._test_multinomial_invalid_probs_cuda + self._spawn_method(test_method, torch.Tensor([0, -1])) + self._spawn_method(test_method, torch.Tensor([0, float('inf')])) + self._spawn_method(test_method, torch.Tensor([0, float('-inf')])) + self._spawn_method(test_method, torch.Tensor([0, float('nan')])) + def test_broadcast(self): TestTorch._test_broadcast(self, lambda t: t.cuda()) @@ -1643,6 +1688,41 @@ def test_nvtx(self): torch.cuda.nvtx.mark("bar") torch.cuda.nvtx.range_pop() + def test_randperm_cuda(self): + cuda = torch.device('cuda:0') + + # For small inputs, randperm is offloaded to CPU instead + with torch.random.fork_rng(devices=[0]): + res1 = torch.randperm(100, device=cuda) + res2 = torch.cuda.LongTensor() + torch.randperm(100, out=res2, device=cuda) + self.assertEqual(res1, res2, 0) + + with torch.random.fork_rng(devices=[0]): + res1 = torch.randperm(100000, device=cuda) + res2 = torch.cuda.LongTensor() + torch.randperm(100000, out=res2, device=cuda) + self.assertEqual(res1, res2, 0) + + with torch.random.fork_rng(devices=[0]): + res1 = torch.randperm(100, dtype=torch.half, device=cuda) + res2 = torch.cuda.HalfTensor() + torch.randperm(100, out=res2, device=cuda) + self.assertEqual(res1, res2, 0) + + with torch.random.fork_rng(devices=[0]): + res1 = torch.randperm(50000, dtype=torch.half, device=cuda) + res2 = torch.cuda.HalfTensor() + torch.randperm(50000, out=res2, device=cuda) + self.assertEqual(res1, res2, 0) + + # randperm of 0 elements is an empty tensor + res1 = torch.randperm(0, device=cuda) + res2 = torch.cuda.LongTensor(5) + torch.randperm(0, out=res2, device=cuda) + self.assertEqual(res1.numel(), 0) + self.assertEqual(res2.numel(), 0) + def test_random_neg_values(self): TestTorch._test_random_neg_values(self, use_cuda=True) diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 23bc1ba2ec0bd8..49122d41d53082 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -344,7 +344,6 @@ def test_sequential_pin_memory(self): self.assertTrue(input.is_pinned()) self.assertTrue(target.is_pinned()) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_multiple_dataloaders(self): loader1_it = iter(DataLoader(self.dataset, num_workers=1)) loader2_it = iter(DataLoader(self.dataset, num_workers=2)) @@ -355,7 +354,6 @@ def test_multiple_dataloaders(self): next(loader1_it) next(loader2_it) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") @unittest.skip("temporarily disable until flaky failures are fixed") def test_segfault(self): p = ErrorTrackingProcess(target=_test_segfault) @@ -373,7 +371,6 @@ def test_segfault(self): finally: p.terminate() - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_timeout(self): p = ErrorTrackingProcess(target=_test_timeout) p.start() @@ -386,7 +383,6 @@ def test_timeout(self): finally: p.terminate() - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_worker_seed(self): num_workers = 6 dataset = SynchronizedSeedDataset(num_workers, num_workers) @@ -396,7 +392,6 @@ def test_worker_seed(self): seeds.add(batch[0]) self.assertEqual(len(seeds), num_workers) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_worker_init_fn(self): dataset = SeedDataset(4) dataloader = DataLoader(dataset, batch_size=2, num_workers=2, @@ -411,19 +406,15 @@ def test_shuffle(self): def test_shuffle_batch(self): 
self._test_shuffle(DataLoader(self.dataset, batch_size=2, shuffle=True)) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_sequential_workers(self): self._test_sequential(DataLoader(self.dataset, num_workers=4)) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_seqential_batch_workers(self): self._test_sequential(DataLoader(self.dataset, batch_size=2, num_workers=4)) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_shuffle_workers(self): self._test_shuffle(DataLoader(self.dataset, shuffle=True, num_workers=4)) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_shuffle_batch_workers(self): self._test_shuffle(DataLoader(self.dataset, batch_size=2, shuffle=True, num_workers=4)) @@ -446,12 +437,10 @@ def _test_batch_sampler(self, **kwargs): self.assertEqual(len(input), 3) self.assertEqual(input, self.data[offset:offset + 3]) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_batch_sampler(self): self._test_batch_sampler() self._test_batch_sampler(num_workers=4) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") def test_shuffle_pin_memory(self): loader = DataLoader(self.dataset, batch_size=2, shuffle=True, num_workers=4, pin_memory=True) @@ -478,11 +467,10 @@ def __len__(self): def test_error(self): self._test_error(DataLoader(ErrorDataset(100), batch_size=2, shuffle=True)) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_error_workers(self): self._test_error(DataLoader(ErrorDataset(41), batch_size=2, shuffle=True, num_workers=4)) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") + @unittest.skipIf(IS_WINDOWS, "FIXME: stuck test") @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") def test_partial_workers(self): "check that workers exit even if the iterator is not exhausted" @@ -637,7 +625,6 @@ class TestStringDataLoader(TestCase): def setUp(self): self.dataset = StringDataset() - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") def test_shuffle_pin_memory(self): loader = DataLoader(self.dataset, batch_size=2, shuffle=True, num_workers=4, pin_memory=True) @@ -721,7 +708,6 @@ def _run_ind_worker_queue_test(self, batch_size, num_workers): if current_worker_idx == num_workers: current_worker_idx = 0 - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_ind_worker_queue(self): for batch_size in (8, 16, 32, 64): for num_workers in range(1, 6): diff --git a/test/test_jit.py b/test/test_jit.py index e667520024be8a..7cccc6a73d9a25 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -92,7 +92,7 @@ def get_fn(file_name, script_path): return fn -class TestJit(TestCase): +class JitTestCase(TestCase): def assertExpectedONNXGraph(self, trace, *args, **kwargs): torch.onnx._optimize_trace(trace, aten=False) self.assertExpectedGraph(trace, *args, **kwargs) @@ -110,21 +110,6 @@ def assertExpectedGraph(self, trace, *args, **kwargs): torch._C._jit_pass_lint(graph) self.assertExpected(str(graph), *args, **kwargs) - def assertExportImport(self, trace, inputs): - initializers = [] - - def run(graph): - return torch._C.GraphExecutor(graph, False)(*inputs) - - proto, _ = trace.graph().export(initializers, onnx_opset_version=0, - defer_weight_export=False, 
export_raw_ir=True) - self.assertFalse(initializers) - - imported_graph, initializers = torch._C._jit_import_graph(proto) - self.assertFalse(initializers) - - self.assertEqual(run(trace.graph()), run(imported_graph)) - def run_pass(self, name, trace): if isinstance(trace, torch._C.Graph): graph = trace @@ -143,6 +128,23 @@ def run_pass(self, name, trace): trace.set_graph(graph) return graph + +class TestJit(JitTestCase): + def assertExportImport(self, trace, inputs): + initializers = [] + + def run(graph): + return torch._C.GraphExecutor(graph, False)(*inputs) + + proto, _ = trace.graph().export(initializers, onnx_opset_version=0, + defer_weight_export=False, export_raw_ir=True) + self.assertFalse(initializers) + + imported_graph, initializers = torch._C._jit_import_graph(proto) + self.assertFalse(initializers) + + self.assertEqual(run(trace.graph()), run(imported_graph)) + def test_simple(self): x = torch.tensor([0.4], requires_grad=True) y = torch.tensor([0.7], requires_grad=True) @@ -154,17 +156,16 @@ def f(x, y): self.assertExpectedGraph(trace) self.assertExportImport(trace, (x, y)) - # index-2 is not implemented in interpreter - @unittest.expectedFailure def test_index(self): x = torch.tensor([0.4], requires_grad=True) - y = torch.tensor([0], dtype=torch.int64, requires_grad=True) + y = torch.tensor([0], dtype=torch.int64) - @torch.jit.compile(nderivs=0) def fn(x, y): return x[y] - fn(x, y) # Fails + fn_traced = torch.jit.trace(x, y)(fn) + + self.assertEqual(fn(x, y), fn_traced(x, y)) # Backwards tracing was broken for indexing by a constant, # because it's internally implemented using as_strided, @@ -704,7 +705,10 @@ def allSum(vs): nograd_inputs = reference_tensors recording_inputs = [t.clone().requires_grad_() for t in reference_tensors] - ge = torch.jit.trace(*input_tensors, optimize=optimize)(func) + if isinstance(func, torch._C.Graph): + ge = torch._C.GraphExecutor(func, optimize) + else: + ge = torch.jit.trace(*input_tensors, optimize=optimize)(func) # test no gradients case @@ -925,8 +929,23 @@ def addmm(mat, mat1, mat2, alpha, beta): self.assertEqual(out_ref, out_test) self.assertExpected(canonical(addmm.graph)) + def test_index_put(self): + ten = torch.zeros(3, 3) + mask = torch.Tensor([[True, True, True], + [True, False, False], + [True, True, False]]).byte() + + def test_fn(ten, mask): + ten[mask] = torch.ones(6) + return ten + + traced_test_fn = torch.jit.trace(ten, mask)(test_fn) + + ten = torch.rand(3, 3) + self.assertEqual(test_fn(ten, mask), traced_test_fn(ten, mask)) -class TestScript(TestCase): + +class TestScript(JitTestCase): @contextmanager def capture_stdout(self): @@ -982,7 +1001,7 @@ def checkScript(self, script, inputs, optimize=True, outputs=None, name='func', source = textwrap.dedent(inspect.getsource(script)) self.checkScript(source, inputs, optimize, outputs, script.__name__, capture_output, frames_up=2) # Continue checking the Python frontend - ge = torch.jit.script(script, _frames_up=1) + ge = torch.jit.script(script, optimize, _frames_up=1) if capture_output: with self.capture_stdout() as captured: @@ -1133,7 +1152,7 @@ def func(x, y): out = func(x, y) self.assertEqual(func(x, y), x + y) - grad = torch.randn(2, 3) + grad = torch.randn(2, 3, dtype=torch.float) out.backward(grad) self.assertEqual(x.grad, grad) self.assertEqual(y.grad, grad.sum(dim=0)) @@ -1165,6 +1184,24 @@ def func(x): def func(x): return torch.cat((x, x), x, dim=0) + def test_cat_lifts(self): + @torch.jit.script + def foo(x): + return torch.cat([x, x], dim=1) + + @torch.jit.script + 
def foo2(x): + return torch.cat([], dim=1) + + @torch.jit.script + def foo3(x): + return torch.cat([x], dim=1) + + self.assertExpected( + canonical(foo.graph) + + canonical(foo2.graph) + + canonical(foo3.graph)) + def test_func_call(self): script = ''' def add(a, b): @@ -1385,18 +1422,15 @@ def func(a, b): self.checkScript(func, inputs, optimize=True) def test_script_for_in_range(self): - script = ''' - def test_for_in_range(): + def fn(): c = 0 for i in range(100): c += i return c - ''' - self.checkScript(script, [], outputs=[4950], optimize=True, name='test_for_in_range') + self.checkScript(fn, (), outputs=4950, optimize=True) def test_script_for_in_range_dynamic(self): - script = ''' - def test_script_for_in_range_dynamic(): + def fn(): c = 0 for i in range(100): acc = 0 @@ -1404,8 +1438,7 @@ def test_script_for_in_range_dynamic(): acc += j c += acc return c - ''' - self.checkScript(script, [], outputs=[161700], optimize=True, name='test_script_for_in_range_dynamic') + self.checkScript(fn, (), optimize=False) def test_script_for_in_range_ast(self): @torch.jit.script @@ -2739,7 +2772,7 @@ def foo(a, b, x): b = x return a - self.checkScript(foo, (torch.zeros(1), torch.zeros(4), torch.zeros(5)), False) + self.checkScript(foo, (torch.zeros(1), torch.zeros(4), torch.zeros(5)), optimize=False) def test_intlist_args(self): def func_1(x): @@ -2920,6 +2953,78 @@ def test_rand(): self.checkScript(test_rand, ()) + def test_loop_unrolling(self): + def fn(x): + y = 0 + for i in range(x): + y += i + return y + + graph = torch.jit._script_graph(fn) + self.run_pass('loop_unrolling', graph) + self.assertExpectedGraph(graph) + self.checkScript(fn, (torch.tensor(10),)) + + def test_loop_unrolling_const(self): + def fn(): + y = 0 + for i in range(10): + y += 1 + return y + + def fn2(): + y = 0 + for i in range(10): + y += i + return y + + def check(fn, name): + graph = torch.jit._script_graph(fn) + self.run_pass('loop_unrolling', graph) + self.assertExpectedGraph(graph, subname=name) + self.checkScript(fn, ()) + + check(fn, 'add_const') + check(fn2, 'add_iter') + + def test_loop_unrolling_nested(self): + def fn(x): + y = 0 + for i in range(10): + for j in range(x): + y += j + return y + + graph = torch.jit._script_graph(fn) + self.run_pass('loop_unrolling', graph) + self.assertExpectedGraph(graph) + self.checkScript(fn, (torch.tensor(10),)) + + def test_loop_unroll_unused_counter(self): + def fn(x): + y = 0 + for i in range(x): + y += 1 + return y + + graph = torch.jit._script_graph(fn) + self.run_pass('loop_unrolling', graph) + self.assertExpectedGraph(graph) + + def test_loop_unroll_negative(self): + def fn(x): + y = 0 + for i in range(x): + y += 1 + return y + + self.checkScript(fn, (torch.tensor(-20),)) + self.checkScript(fn, (torch.tensor(-2),)) + self.checkScript(fn, (torch.tensor(-1),)) + self.checkScript(fn, (torch.tensor(0),)) + self.checkScript(fn, (torch.tensor(1),)) + self.checkScript(fn, (torch.tensor(2),)) + # Smoke tests for export methods class TestPytorchExportModes(unittest.TestCase): diff --git a/test/test_nn.py b/test/test_nn.py index 3e3356dfad4725..13b7013200b805 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -2430,16 +2430,15 @@ def test_parallel_apply(self): i2 = torch.randn(2, 10, device="cuda:1", dtype=torch.float) expected1 = l1(i1).data expected2 = l2(i2).data - inputs = ((i1,), (i2,)) modules = (l1, l2) expected_outputs = (expected1, expected2) - outputs = dp.parallel_apply(modules, inputs, None) - for out, expected in zip(outputs, expected_outputs): - 
self.assertEqual(out.data, expected) - - inputs = (i1, i2.new_empty(0)) - expected_outputs = (expected1, expected2.new_empty(0)) + # each input can be either a collection of positional arguments + # or an object representing the single argument + for inputs in [((i1,), (i2,)), (i1, i2)]: + outputs = dp.parallel_apply(modules, inputs, None) + for out, expected in zip(outputs, expected_outputs): + self.assertEqual(out.data, expected) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") def test_data_parallel_multiple_input(self): @@ -6289,7 +6288,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm1d', @@ -6298,7 +6296,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='3d_input', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm1d', @@ -6307,7 +6304,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='affine_simple_average', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm1d', @@ -6316,7 +6312,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='not_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm1d', @@ -6325,7 +6320,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='not_tracking_stats', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm1d', @@ -6334,7 +6328,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='3d_input_not_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm2d', @@ -6342,7 +6335,6 @@ def multimarginloss_weights_no_reduce_test(): input_size=(2, 3, 6, 6), cudnn=True, check_eval=True, - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm2d', @@ -6351,7 +6343,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='2d_simple_average', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm2d', @@ -6360,7 +6351,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='momentum', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm2d', @@ -6369,7 +6359,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='not_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm2d', @@ -6378,7 +6367,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='not_tracking_stats', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm3d', @@ -6386,7 +6374,6 @@ def multimarginloss_weights_no_reduce_test(): input_size=(2, 3, 4, 4, 4), cudnn=True, check_eval=True, - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm3d', @@ -6395,7 +6382,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='3d_simple_average', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm3d', @@ -6404,7 +6390,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='momentum', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm3d', @@ -6413,7 +6398,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, 
desc='not_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='BatchNorm3d', @@ -6422,7 +6406,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='not_tracking_stats', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='InstanceNorm1d', @@ -6430,7 +6413,6 @@ def multimarginloss_weights_no_reduce_test(): input_size=(4, 3, 15), cudnn=True, check_eval=True, - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='InstanceNorm1d', @@ -6439,7 +6421,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='tracking_stats', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='InstanceNorm2d', @@ -6447,7 +6428,6 @@ def multimarginloss_weights_no_reduce_test(): input_size=(2, 3, 6, 6), cudnn=True, check_eval=True, - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='InstanceNorm2d', @@ -6456,7 +6436,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='tracking_stats', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='InstanceNorm3d', @@ -6464,7 +6443,6 @@ def multimarginloss_weights_no_reduce_test(): input_size=(2, 3, 4, 4, 4), cudnn=True, check_eval=True, - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='InstanceNorm3d', @@ -6473,7 +6451,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='tracking_stats', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='LayerNorm', @@ -6482,7 +6459,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='1d_elementwise_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='LayerNorm', @@ -6491,7 +6467,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='1d_no_elementwise_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='LayerNorm', @@ -6500,7 +6475,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='3d_elementwise_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='LayerNorm', @@ -6509,7 +6483,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='3d_no_elementwise_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='GroupNorm', @@ -6518,7 +6491,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='1d_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='GroupNorm', @@ -6527,7 +6499,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='1d_no_affine_IN', # this setting is equivalent with InstanceNorm - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='GroupNorm', @@ -6536,7 +6507,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='1d_no_affine_LN', # this setting is equivalent with LayerNorm - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='GroupNorm', @@ -6545,7 +6515,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='2d_affine', - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='GroupNorm', @@ -6554,7 +6523,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='2d_no_affine_IN', # this setting is equivalent with InstanceNorm - 
decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='GroupNorm', @@ -6563,7 +6531,6 @@ def multimarginloss_weights_no_reduce_test(): cudnn=True, check_eval=True, desc='2d_no_affine_LN', # this setting is equivalent with LayerNorm - decorator=skipCUDAMemoryLeakCheckIf(IS_WINDOWS), ), dict( module_name='Conv1d', diff --git a/test/test_sparse.py b/test/test_sparse.py index 3c9e0e93524428..f2d557faca05cd 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -910,6 +910,12 @@ def test_factory_size_check(self): with self.assertRaisesRegex(RuntimeError, "values and sizes are inconsistent"): self.SparseTensor(indices, values, sizes) + def test_factory_empty_indices(self): + device = 'cuda' if self.is_cuda else 'cpu' + tensor = torch.sparse_coo_tensor([], [], torch.Size([]), device=device) + expected_indices = torch.tensor([], dtype=torch.long, device=device) + self.assertEqual(tensor._indices(), expected_indices) + @cpu_only def test_factory_type_inference(self): t = torch.sparse_coo_tensor(torch.tensor(([0], [2])), torch.tensor([1.], dtype=torch.float32)) diff --git a/test/test_torch.py b/test/test_torch.py index 723481db1c4443..1d331b40abca60 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -17,6 +17,7 @@ from torch._utils import _rebuild_tensor from itertools import product, combinations from functools import reduce +from torch import multiprocessing as mp from common import TestCase, iter_indices, TEST_NUMPY, TEST_SCIPY, TEST_MKL, \ run_tests, download_file, skipIfNoLapack, suppress_warnings, IS_WINDOWS, PY3 @@ -2368,6 +2369,32 @@ def make_prob_dist(shape, is_contiguous): def test_multinomial(self): self._test_multinomial(self, torch.FloatTensor) + def _spawn_method(self, method, arg): + try: + mp.set_start_method('spawn') + except RuntimeError: + pass + with mp.Pool(1) as pool: + self.assertTrue(pool.map(method, [arg])) + + @staticmethod + def _test_multinomial_invalid_probs(probs): + try: + torch.multinomial(probs.to('cpu'), 1) + return False # Should not be reached + except RuntimeError as e: + return 'invalid multinomial distribution' in str(e) + + @unittest.skipIf(not PY3, + "spawn start method is not supported in Python 2, \ + but we need it for for testing failure case for CPU RNG on Windows") + def test_multinomial_invalid_probs(self): + test_method = TestTorch._test_multinomial_invalid_probs + self._spawn_method(test_method, torch.Tensor([0, -1])) + self._spawn_method(test_method, torch.Tensor([0, float('inf')])) + self._spawn_method(test_method, torch.Tensor([0, float('-inf')])) + self._spawn_method(test_method, torch.Tensor([0, float('nan')])) + @suppress_warnings def test_range(self): res1 = torch.range(0, 1) diff --git a/test/test_utils.py b/test/test_utils.py index 42d4b6444736a1..91dfdbff7f2143 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -17,7 +17,6 @@ from torch.utils.trainer import Trainer from torch.utils.trainer.plugins import * from torch.utils.trainer.plugins.plugin import Plugin -from torch.utils.serialization import load_lua from torch.autograd._functions.utils import prepare_onnx_paddings from torch.autograd._functions.utils import check_onnx_broadcast from common import IS_WINDOWS, IS_PPC @@ -281,7 +280,6 @@ def test_multi_keep(self): dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 2) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") def test_multi_drop(self): dataloader = torch.utils.data.DataLoader(self.dataset, batch_size=self.batch_size, @@ -593,14 +591,14 @@ def 
_check_autograd_summary(self, output): 'Distance between autograd prof output and end of output not in [6, 100] lines', output)) def _check_cuda(self, output): - if torch.cuda.is_available(): + if HAS_CUDA: results = re.search('CUDA mode', output) self.assertIsNotNone(results, self._fail_msg('Should tell users CUDA', output)) else: results = re.search('CUDA mode', output) self.assertIsNone(results, self._fail_msg('Should not tell users about CUDA', output)) - @unittest.skipIf(torch.cuda.is_available(), 'CPU-only test') + @unittest.skipIf(HAS_CUDA, 'CPU-only test') def test_bottleneck_cpu_only(self): rc, out, err = self._run_bottleneck('bottleneck/test.py') self.assertEqual(rc, 0, 'Run failed with\n{}'.format(err)) @@ -611,8 +609,7 @@ def test_bottleneck_cpu_only(self): self._check_cprof_summary(out) self._check_cuda(out) - @unittest.skipIf(IS_WINDOWS, "FIXME: Intermittent CUDA out-of-memory error") - @unittest.skipIf(not torch.cuda.is_available(), 'No CUDA') + @unittest.skipIf(not HAS_CUDA, 'No CUDA') def test_bottleneck_cuda(self): rc, out, err = self._run_bottleneck('bottleneck/test_cuda.py') self.assertEqual(rc, 0, 'Run failed with\n{}'.format(err)) @@ -747,6 +744,7 @@ def try_check_onnx_broadcast(dims1, dims2, expect_broadcast, expect_fail): try_check_onnx_broadcast(dims1, dims2, True, False) -TestLuaReader.init() if __name__ == '__main__': + from torch.utils.serialization import load_lua + TestLuaReader.init() run_tests() diff --git a/third_party/benchmark b/third_party/benchmark index 105ac14b2f671f..505be96ab23056 160000 --- a/third_party/benchmark +++ b/third_party/benchmark @@ -1 +1 @@ -Subproject commit 105ac14b2f671f7d448f621290404098189f424d +Subproject commit 505be96ab23056580a3a2315abba048f4428b04e diff --git a/third_party/gloo b/third_party/gloo index aad0002fb40612..69eef748cc1dfb 160000 --- a/third_party/gloo +++ b/third_party/gloo @@ -1 +1 @@ -Subproject commit aad0002fb40612e991390d8e807f247ed23f13c5 +Subproject commit 69eef748cc1dfbe0fefed69b34e6545495f67ac5 diff --git a/third_party/onnx b/third_party/onnx index 968d28d901d2ef..3a035f439799de 160000 --- a/third_party/onnx +++ b/third_party/onnx @@ -1 +1 @@ -Subproject commit 968d28d901d2efdaf6d5fcfd529106762524cdfa +Subproject commit 3a035f439799de3568c364f3f87014841037708e diff --git a/tools/amd_build/build_pytorch_amd.py b/tools/amd_build/build_pytorch_amd.py index 315220ec4c7072..5ce6a5514a10e8 100644 --- a/tools/amd_build/build_pytorch_amd.py +++ b/tools/amd_build/build_pytorch_amd.py @@ -53,7 +53,7 @@ if reduce(lambda result, exclude: source.endswith(exclude) or result, ignore_files, False): continue # Update contents. 
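    # (The explicit encoding="utf-8" is dropped in the hunk below, presumably
    #  so this script also runs under Python 2, whose built-in open() does not
    #  accept an encoding argument.)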
- with open(source, "r+", encoding="utf-8") as f: + with open(source, "r+") as f: contents = f.read() contents = contents.replace("WITH_CUDA", "WITH_ROCM") contents = contents.replace("CUDA_VERSION", "0") diff --git a/tools/amd_build/disabled_features.yaml b/tools/amd_build/disabled_features.yaml index 60fd665e08d1ae..65dfb955a020a2 100644 --- a/tools/amd_build/disabled_features.yaml +++ b/tools/amd_build/disabled_features.yaml @@ -1,5 +1,5 @@ { - "disabled_hip_function_calls": + "disable_unsupported_hip_calls": [ { "path": "aten/src/THC/generic/THCTensorSort.cu", @@ -72,6 +72,12 @@ "#include ": "" } }, + { + "path": "aten/src/ATen/native/cuda/Distributions.cu", + "s_constants": { + "#include ": "" + } + }, { "path": "aten/src/THC/THCNumerics.cuh", "s_constants": { @@ -129,10 +135,16 @@ "disabled_modules": [ "aten/src/ATen/native/cuda/CuFFTUtils.h", "aten/src/ATen/native/cuda/SpectralOps.cu", - "aten/src/ATen/native/cuda/Distributions.cu", - "aten/src/THCUNN/RReLU.cu" + "aten/src/THCUNN/RReLU.cu", + "aten/src/ATen/native/cuda/Distributions.cu" ], "disabled_functions": [ + { + "path": "aten/src/ATen/cuda/CUDAApplyUtils.cuh", + "functions": [ + "kernelPointwiseApply4" + ] + }, { "path": "aten/src/ATen/cuda/detail/IndexUtils.cu", "non_device_functions": [ @@ -148,7 +160,9 @@ { "path": "aten/src/ATen/native/cuda/Distributions.cu", "functions": [ - "_s_poisson_cuda" + "_s_poisson_cuda", + "poisson_cuda_kernel", + "gamma_cuda_kernel" ] }, { @@ -304,3 +318,4 @@ } ] } + diff --git a/tools/amd_build/patches/a_aten_src_THCUNN_THCHalfAutoNumerics.cuh.patch b/tools/amd_build/patches/a_aten_src_THCUNN_THCHalfAutoNumerics.cuh.patch deleted file mode 100644 index d1bcb76bb5866d..00000000000000 --- a/tools/amd_build/patches/a_aten_src_THCUNN_THCHalfAutoNumerics.cuh.patch +++ /dev/null @@ -1,23 +0,0 @@ -diff --git a/aten/src/THCUNN/THCHalfAutoNumerics.cuh b/aten/src/THCUNN/THCHalfAutoNumerics.cuh -index 2653fed0b..c4e9089e0 100644 ---- a/aten/src/THCUNN/THCHalfAutoNumerics.cuh -+++ b/aten/src/THCUNN/THCHalfAutoNumerics.cuh -@@ -19,13 +19,17 @@ inline __host__ __device__ float fmaxType(float x, half y) { - } - #endif - -+/* In ROCm we have a conversion from half to __fp16, and then there's a -+conversion operator from __fp16 to double (w/ the standard conversion -+double to float), so comment out these two lines to prevent ambiguous calls -+for fmaxType when half is passed in. - inline __host__ __device__ float fmaxType(float x, float y) { - return fmaxf(x, y); - } - - inline __host__ __device__ double fmaxType(double x, double y) { - return fmax(x, y); --} -+}*/ - - #ifdef CUDA_HALF_TENSOR - diff --git a/tools/amd_build/patches/a_aten_src_THC_THCTensorMathReduce.cuh.patch b/tools/amd_build/patches/a_aten_src_THC_THCTensorMathReduce.cuh.patch index 72e4160ea4a994..29c27ba5a90874 100644 --- a/tools/amd_build/patches/a_aten_src_THC_THCTensorMathReduce.cuh.patch +++ b/tools/amd_build/patches/a_aten_src_THC_THCTensorMathReduce.cuh.patch @@ -1,39 +1,37 @@ diff --git a/aten/src/THC/THCTensorMathReduce.cuh b/aten/src/THC/THCTensorMathReduce.cuh -index ca6bf7cbe..a523648f1 100644 +index d118f5dd3..dda7f3c9c 100644 --- a/aten/src/THC/THCTensorMathReduce.cuh +++ b/aten/src/THC/THCTensorMathReduce.cuh -@@ -105,7 +105,7 @@ struct SquareFunctor { +@@ -73,14 +73,14 @@ struct SquareFunctor { template struct ReduceMin { inline __device__ T operator()(T a, T b) const { -- return (THCNumerics::lt(a, b) || -+ return ((int)THCNumerics::sub(a, b) < 0 || - THCNumerics::isnan(a)) ? 
a : b; +- return (THCNumerics::lt(a, b) || THCNumerics::isnan(a)) ? a : b; ++ return ((int)THCNumerics::sub(a, b) < 0 || THCNumerics::isnan(a)) ? a : b; } }; -@@ -113,7 +113,7 @@ struct ReduceMin { + template struct ReduceMax { inline __device__ T operator()(T a, T b) const { -- return (THCNumerics::gt(a, b) || -+ return ((int)THCNumerics::sub(a, b) > 0 || - THCNumerics::isnan(a)) ? a : b; +- return (THCNumerics::gt(a, b) || THCNumerics::isnan(a)) ? a : b; ++ return ((int)THCNumerics::sub(a, b) > 0 || THCNumerics::isnan(a)) ? a : b; } }; -@@ -167,7 +167,7 @@ __global__ void THCTensor_kernel_renorm(Real *data, const Real value, const ptrd - buffer[tx] = ScalarConvert::to(0); - Real norm; -- -+ #if !defined(__HIP_DEVICE_COMPILE__) - if (THCNumerics::eq(value, ScalarConvert::to(INFINITY))) { - // get norm of axis - for (ptrdiff_t i=tx; i::mul(row[i], norm); +@@ -108,6 +108,7 @@ __global__ void THCTensor_kernel_renorm(T *data, + const AccT value, + const ptrdiff_t size, + const AccT maxnorm) { ++#if !defined(__HIP_DEVICE_COMPILE__) + __shared__ AccT buffer[32]; + int64_t tx = threadIdx.x; + int64_t bx = blockIdx.x; +@@ -163,6 +164,7 @@ __global__ void THCTensor_kernel_renorm(T *data, + row[i] = scalar_cast(THCNumerics::mul(val, norm)); } } -+ #endif ++#endif } template diff --git a/tools/amd_build/patches/a_aten_src_THC_generic_THCTensorRandom.cu.patch b/tools/amd_build/patches/a_aten_src_THC_generic_THCTensorRandom.cu.patch index 8fbc7ad7e90643..355c1e3a7c3faf 100644 --- a/tools/amd_build/patches/a_aten_src_THC_generic_THCTensorRandom.cu.patch +++ b/tools/amd_build/patches/a_aten_src_THC_generic_THCTensorRandom.cu.patch @@ -1,5 +1,5 @@ diff --git a/aten/src/THC/generic/THCTensorRandom.cu b/aten/src/THC/generic/THCTensorRandom.cu -index 906780b4f..c99f156ab 100644 +index 906780b4f..b03e051cb 100644 --- a/aten/src/THC/generic/THCTensorRandom.cu +++ b/aten/src/THC/generic/THCTensorRandom.cu @@ -504,11 +504,11 @@ THC_API void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_ @@ -7,7 +7,7 @@ index 906780b4f..c99f156ab 100644 if (range > 1ULL << 32) { generate_random_64<<>>( - gen->state.gen_states, size, data, min_val, range); -+ gen->state.gen_states, static_cast(size), data, static_cast::value, float, double>::type>(min_val), static_cast::value, float, double>::type>(range)); ++ gen->state.gen_states, static_cast(size), data, min_val, range); } else { #endif generate_random<<>>( @@ -17,7 +17,7 @@ index 906780b4f..c99f156ab 100644 } #endif @@ -534,19 +534,19 @@ THC_API void THCTensor_(random)(THCState* state, THCTensor *self_) - + #if defined(THC_REAL_IS_HALF) generate_random<<>>( - gen->state.gen_states, size, data, 0UL, (1UL << HLF_MANT_DIG) + 1); @@ -39,5 +39,5 @@ index 906780b4f..c99f156ab 100644 - gen->state.gen_states, size, data, 0UL, static_cast(std::numeric_limits::max()) + 1); + gen->state.gen_states, static_cast(size), data, static_cast(0UL), static_cast(std::numeric_limits::max()) + 1); #endif - + THCTensor_(freeCopyTo)(state, self, self_); diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 8792233d572f55..8a1adeb37efd9f 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -690,7 +690,7 @@ - name: zero_(Tensor self) self: zeros_like(grad) -- name: _sparse_mask(Tensor self, SparseTensor mask) +- name: _sparse_mask(Tensor self, SparseTensorRef mask) self: not_implemented("_sparse_mask") mask: not_implemented("_sparse_mask") diff --git a/tools/autograd/gen_autograd_functions.py 
b/tools/autograd/gen_autograd_functions.py index a343050eb34806..c91317e94043c7 100644 --- a/tools/autograd/gen_autograd_functions.py +++ b/tools/autograd/gen_autograd_functions.py @@ -18,7 +18,7 @@ struct ${op} : public ${superclass} { using ${superclass}::${superclass}; variable_list apply(const variable_list& grads) override; - std::string name() override { return "${op}"; } + std::string name() const override { return "${op}"; } void release_variables() override { ${release_variables} } @@ -63,6 +63,12 @@ } """) +DERIVATIVE_MULTI_COPY_RANGE = CodeTemplate("""\ + if (should_compute_output({ ${name}_ix })) { + copy_range(grad_inputs, ${name}_ix, std::get<${i}>(grad_result)); + } +""") + DERIVATIVE_MULTI = CodeTemplate("""\ if (should_compute_output({ ${idx_ranges} })) { ${grad_input_mask} @@ -173,7 +179,7 @@ def emit_derivative(derivative): idx_ranges = ', '.join("{}_ix".format(n) for n in var_names) copy_ranges = [] for i, n in enumerate(var_names): - copy_ranges.append("copy_range(grad_inputs, {}_ix, std::get<{}>(grad_result));".format(n, i)) + copy_ranges.append(DERIVATIVE_MULTI_COPY_RANGE.substitute(name=n, i=i)) return DERIVATIVE_MULTI.substitute( idx_ranges=idx_ranges, copy_ranges=copy_ranges, derivative=formula, diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 34b06d4c64a3c5..3b4d4216a20dcb 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -129,12 +129,12 @@ def should_generate_python_binding(declaration): return False # TODO: fix handling of SparseTensor. We don't want to generate Python - # bindings to SparseTensor overloads, such as add(Tensor, SparseTensor), + # bindings to SparseTensor overloads, such as add(Tensor, SparseTensorRef), # since the Tensor-based signature already dynamically dispatches correctly. # However, _sparse_mask only has a SparseTensor signature so we need to bind # that function. 
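[Editor's note] To make the gen_autograd_functions.py change above concrete: each copy into grad_inputs is now emitted from the DERIVATIVE_MULTI_COPY_RANGE template, so every output gets its own should_compute_output guard instead of an unguarded copy_range(...) line built with str.format. A rough illustration of the expansion, using the standard-library string.Template as a stand-in for the codegen's own CodeTemplate helper (whitespace is approximate):

```python
from string import Template  # stand-in for the codegen's CodeTemplate class

DERIVATIVE_MULTI_COPY_RANGE = Template("""\
if (should_compute_output({ ${name}_ix })) {
  copy_range(grad_inputs, ${name}_ix, std::get<${i}>(grad_result));
}
""")

# For a derivative with variables ("self", "other"), emit_derivative would now
# produce one guarded block per output; e.g. for n="other", i=1:
print(DERIVATIVE_MULTI_COPY_RANGE.substitute(name="other", i=1))
# if (should_compute_output({ other_ix })) {
#   copy_range(grad_inputs, other_ix, std::get<1>(grad_result));
# }
```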
for arg in declaration['arguments']: - if arg['type'] == 'SparseTensor' and declaration['name'] != '_sparse_mask': + if arg['type'] == 'SparseTensorRef' and declaration['name'] != '_sparse_mask': return False return True @@ -207,7 +207,7 @@ def create_python_bindings(python_functions, has_self, is_module=False): unpack_methods = { 'const Tensor &': 'tensor', - 'SparseTensor': 'tensor', + 'SparseTensorRef': 'tensor', 'Tensor &': 'tensor', 'Generator *': 'generator', 'Storage &': 'storage', @@ -310,8 +310,8 @@ def parse_arg(arg, arg_index, unpack_args=False): if typename == 'Storage &': expr = '*' + expr - if typename == 'SparseTensor': - expr = 'SparseTensor({})'.format(expr) + if typename == 'SparseTensorRef': + expr = 'SparseTensorRef({})'.format(expr) dispatch_type = typename if dispatch_type == 'Tensor': @@ -542,7 +542,7 @@ def process_function(name, declarations): if not has_self: # Use 'input' instead of 'self' for NN functions signature = signature.replace('Tensor self', 'Tensor input') - signature = signature.replace('SparseTensor', 'Tensor') + signature = signature.replace('SparseTensorRef', 'Tensor') if dictionary['base'].get('deprecated', False): signature += '|deprecated' env['signatures'].append('"{}",'.format(signature)) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index c402855a8ba800..6348807be93eb7 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -328,7 +328,7 @@ def save_variables(saved_variables, is_output): def reference_args(args): res = [] for arg in args: - if arg['type'] == 'SparseTensor': + if arg['type'] == 'SparseTensorRef': res.append('{}.tref'.format(arg['name'])) else: res.append(arg['name']) @@ -538,7 +538,7 @@ def requires_unpack(arg): dynamic_type = arg['dynamic_type'] is_nullable = arg.get('is_nullable', False) - ref = (not is_nullable) and dynamic_type not in ['TensorList', 'SparseTensor'] + ref = (not is_nullable) and dynamic_type not in ['TensorList', 'SparseTensorRef'] suffix = '_opt' if is_nullable else '' body.append(UNPACK_TENSOR.substitute( diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index 38fa9c2ed654eb..7c8a81cc2de8d4 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -36,7 +36,7 @@ namespace torch { namespace autograd { // don't want to make the codegen do the dispatch manually) static void setattr(jit::Node* n, jit::Symbol name, int64_t v) { n->i_(name, v); } static void setattr(jit::Node* n, jit::Symbol name, const at::Scalar& v) { n->t_(name, v.toTensor()); } -static void setattr(jit::Node* n, jit::Symbol name, SparseTensor s) { n->t_(name, s.tref); } +static void setattr(jit::Node* n, jit::Symbol name, SparseTensorRef s) { n->t_(name, s.tref); } static void setattr(jit::Node* n, jit::Symbol name, const at::IntList& v) { n->is_(name, v); } static void setattr(jit::Node* n, jit::Symbol name, bool v) { n->i_(name, v); } static void setattr(jit::Node* n, jit::Symbol name, double v) { n->f_(name, v); } @@ -60,7 +60,7 @@ void failPositionalAttr() { static void setposattr(jit::Node* n, size_t idx, const char *name, int64_t v) { genericInsertInput(n, idx, v); } static void setposattr(jit::Node* n, size_t idx, const char *name, const at::Scalar& v) { genericInsertInput(n, idx, v); } -static void setposattr(jit::Node* n, size_t idx, const char *name, SparseTensor s) { failPositionalAttr(); } +static void setposattr(jit::Node* n, size_t idx, const char *name, 
SparseTensorRef s) { failPositionalAttr(); } static void setposattr(jit::Node* n, size_t idx, const char *name, const at::IntList& v) { using ArgumentStash = jit::tracer::ArgumentStash; if (ArgumentStash::hasIntList(name)) { @@ -236,8 +236,8 @@ Tensor & VariableType::unpack(const Tensor & t, const char * name, int pos) { return checked_cast_variable(t, name, pos).data(); } -SparseTensor VariableType::unpack(SparseTensor t, const char * name, int pos) { - return SparseTensor(checked_cast_variable(t.tref, name, pos).data()); +SparseTensorRef VariableType::unpack(SparseTensorRef t, const char * name, int pos) { + return SparseTensorRef(checked_cast_variable(t.tref, name, pos).data()); } Tensor VariableType::unpack_opt(const Tensor & t, const char * name, int pos) { @@ -359,35 +359,35 @@ static void throw_error_out_requires_grad(const char* name) { static void rebase_history(Variable& var, std::shared_ptr grad_fn) { if (grad_fn && var.defined()) { - grad_fn->set_num_inputs(1); + grad_fn->add_input_metadata(var.type(), var.sizes()); var.rebase_history({std::move(grad_fn), 0}); } } static void rebase_history(ArrayRef vars, std::shared_ptr grad_fn) { if (grad_fn) { - grad_fn->set_num_inputs(vars.size()); - uint32_t output_nr = 0; for (auto& var : vars) { if (var.defined()) { // TODO: eliminate const_cast + auto output_nr = grad_fn->add_input_metadata(var.type(), var.sizes()); const_cast(var).rebase_history({grad_fn, output_nr}); + } else { + grad_fn->add_input_metadata(Function::undefined_input()); } - output_nr++; } } } static void set_history(ArrayRef vars, std::shared_ptr grad_fn) { if (grad_fn) { - grad_fn->set_num_inputs(vars.size()); - uint32_t output_nr = 0; for (auto& var : vars) { if (var.defined()) { // TODO: eliminate const_cast + auto output_nr = grad_fn->add_input_metadata(var.type(), var.sizes()); const_cast(var).set_gradient_edge({grad_fn, output_nr}); + } else { + grad_fn->add_input_metadata(Function::undefined_input()); } - output_nr++; } } } @@ -428,7 +428,6 @@ Tensor & VariableType::s_copy_(Tensor & self, const Tensor & src, bool non_block if (requires_grad) { grad_fn = std::make_shared(); grad_fn->set_next_edges(collect_next_edges(self, src)); - grad_fn->set_num_inputs(1); grad_fn->src_type = &src.type(); grad_fn->src_device = src.is_cuda() ? 
src.get_device() : -1; } diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index c90e3f5247f17c..01cd96c4396257 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -17,7 +17,7 @@ using at::Context; using at::Generator; using at::IntList; using at::Scalar; -using at::SparseTensor; +using at::SparseTensorRef; using at::Storage; using at::Tensor; using at::TensorList; @@ -62,7 +62,7 @@ struct VariableType final : public at::Type { // checks that t is actually a Variable static Variable & checked_cast_variable(const Tensor & t, const char * name, int pos); static at::Tensor & unpack(const Tensor & t, const char * name, int pos); - static at::SparseTensor unpack(SparseTensor t, const char * name, int pos); + static at::SparseTensorRef unpack(SparseTensorRef t, const char * name, int pos); static at::Tensor unpack_opt(const Tensor & t, const char * name, int pos); static std::vector unpack(at::TensorList tl, const char *name, int pos); diff --git a/tools/autograd/templates/python_torch_functions_dispatch.h b/tools/autograd/templates/python_torch_functions_dispatch.h index 45c4f79868ecc6..c6ab4d5d18e55f 100644 --- a/tools/autograd/templates/python_torch_functions_dispatch.h +++ b/tools/autograd/templates/python_torch_functions_dispatch.h @@ -19,7 +19,7 @@ using at::Scalar; using at::TensorList; using at::IntList; using at::Generator; -using at::SparseTensor; +using at::SparseTensorRef; using at::Storage; static at::Type& default_type() { diff --git a/tools/autograd/templates/python_variable_methods_dispatch.h b/tools/autograd/templates/python_variable_methods_dispatch.h index c17b30c51ed116..82e7e409472e29 100644 --- a/tools/autograd/templates/python_variable_methods_dispatch.h +++ b/tools/autograd/templates/python_variable_methods_dispatch.h @@ -16,7 +16,7 @@ using at::Scalar; using at::TensorList; using at::IntList; using at::Generator; -using at::SparseTensor; +using at::SparseTensorRef; using at::Storage; ${py_method_dispatch} diff --git a/tools/build_pytorch_libs.bat b/tools/build_pytorch_libs.bat index 07683866995f75..7a7970e7f100e9 100755 --- a/tools/build_pytorch_libs.bat +++ b/tools/build_pytorch_libs.bat @@ -26,6 +26,13 @@ IF "%~1"=="--with-cuda" ( set /a USE_CUDA=0 ) +IF "%~1"=="--with-rocm" ( + set /a WITH_ROCM=1 + shift +) ELSE ( + set /a WITH_ROCM=0 +) + IF "%~1"=="--with-nnpack" ( set /a NO_NNPACK=0 set /a USE_NNPACK=1 @@ -35,6 +42,29 @@ IF "%~1"=="--with-nnpack" ( set /a USE_NNPACK=0 ) +IF "%~1"=="--with-mkldnn" ( + set /a NO_MKLDNN=0 + shift +) ELSE ( + set /a NO_MKLDNN=1 +) + +IF "%~1"=="--with-gloo-ibverbs" ( + set /a WITH_GLOO_IBVERBS=1 + echo Warning: gloo iverbs is enabled but build is not yet implemented 1>&2 + shift +) ELSE ( + set /a WITH_GLOO_IBVERBS=0 +) + +IF "%~1"=="--with-distributed-mw" ( + set /a WITH_DISTRIBUTED_MW=1 + echo Warning: distributed mw is enabled but build is not yet implemented 1>&2 + shift +) ELSE ( + set /a WITH_DISTRIBUTED_MW=0 +) + set BUILD_TYPE=Release IF "%DEBUG%"=="1" ( set BUILD_TYPE=Debug @@ -144,7 +174,7 @@ goto:eof mkdir build cd build cmake .. 
%CMAKE_GENERATOR_COMMAND% ^ - -DCMAKE_INSTALL_PREFIX="%INSTALL_DIR%" ^ + -DCMAKE_BUILD_TYPE=%BUILD_TYPE% ^ -DBUILD_CAFFE2=OFF ^ -DBUILD_ATEN=ON ^ -DBUILD_PYTHON=OFF ^ @@ -154,7 +184,18 @@ goto:eof -DCUDNN_INCLUDE_DIR="%CUDNN_INCLUDE_DIR%" ^ -DCUDNN_LIB_DIR="%CUDNN_LIB_DIR%" ^ -DCUDNN_LIBRARY="%CUDNN_LIBRARY%" ^ - -DCMAKE_BUILD_TYPE=%BUILD_TYPE% + -DNO_MKLDNN=%NO_MKLDNN% ^ + -DMKLDNN_INCLUDE_DIR="%MKLDNN_INCLUDE_DIR%" ^ + -DMKLDNN_LIB_DIR="%MKLDNN_LIB_DIR%" ^ + -DMKLDNN_LIBRARY="%MKLDNN_LIBRARY%" ^ + -DATEN_NO_CONTRIB=1 ^ + -DCMAKE_INSTALL_PREFIX="%INSTALL_DIR%" ^ + -DCMAKE_EXPORT_COMPILE_COMMANDS=1 ^ + -DCMAKE_C_FLAGS="%USER_CFLAGS%" ^ + -DCMAKE_CXX_FLAGS="/EHa %USER_CFLAGS%" ^ + -DCMAKE_EXE_LINKER_FLAGS="%USER_LDFLAGS%" ^ + -DCMAKE_SHARED_LINKER_FLAGS="%USER_LDFLAGS%" ^ + -DWITH_ROCM=%WITH_ROCM% %MAKE_COMMAND% IF NOT %ERRORLEVEL%==0 exit 1 diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 60e1dfeac863bf..0323b96c835b31 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -17,6 +17,7 @@ WITH_NNPACK=0 WITH_MKLDNN=0 WITH_GLOO_IBVERBS=0 WITH_DISTRIBUTED_MW=0 +FULL_CAFFE2=0 while [[ $# -gt 0 ]]; do case "$1" in --with-cuda) @@ -37,6 +38,9 @@ while [[ $# -gt 0 ]]; do --with-distributed-mw) WITH_DISTRIBUTED_MW=1 ;; + --full-caffe2) + FULL_CAFFE2=1 + ;; *) break ;; @@ -81,12 +85,10 @@ C_FLAGS=" -DTH_INDEX_BASE=0 -I\"$INSTALL_DIR/include\" \ # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=686926 C_FLAGS="${C_FLAGS} -DOMPI_SKIP_MPICXX=1" LDFLAGS="-L\"$INSTALL_DIR/lib\" " -LD_POSTFIX=".so.1" -LD_POSTFIX_UNVERSIONED=".so" +LD_POSTFIX=".so" if [[ $(uname) == 'Darwin' ]]; then LDFLAGS="$LDFLAGS -Wl,-rpath,@loader_path" - LD_POSTFIX=".1.dylib" - LD_POSTFIX_UNVERSIONED=".dylib" + LD_POSTFIX=".dylib" else LDFLAGS="$LDFLAGS -Wl,-rpath,\$ORIGIN" fi @@ -139,6 +141,9 @@ function build() { nanopb ) BUILD_C_FLAGS=$C_FLAGS" -fPIC -fexceptions";; *) BUILD_C_FLAGS=$C_FLAGS" -fexceptions";; esac + # TODO: The *_LIBRARIES cmake variables should eventually be + # deprecated because we are using .cmake files to handle finding + # installed libraries instead ${CMAKE_VERSION} ../../$1 -DCMAKE_MODULE_PATH="$BASE_DIR/cmake/FindCUDA" \ ${CMAKE_GENERATOR} \ -DTorch_FOUND="1" \ @@ -177,9 +182,6 @@ function build() { popd local lib_prefix=$INSTALL_DIR/lib/lib$1 - if [ -f "$lib_prefix$LD_POSTFIX" ]; then - rm -rf -- "$lib_prefix$LD_POSTFIX_UNVERSIONED" - fi if [[ $(uname) == 'Darwin' ]]; then pushd "$INSTALL_DIR/lib" @@ -224,9 +226,9 @@ function build_caffe2() { ${CMAKE_VERSION} .. \ ${CMAKE_GENERATOR} \ -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ - -DBUILD_CAFFE2=OFF \ + -DBUILD_CAFFE2=$FULL_CAFFE2 \ -DBUILD_ATEN=ON \ - -DBUILD_PYTHON=OFF \ + -DBUILD_PYTHON=$FULL_CAFFE2 \ -DBUILD_BINARY=OFF \ -DBUILD_SHARED_LIBS=ON \ -DUSE_CUDA=$WITH_CUDA \ @@ -249,6 +251,17 @@ function build_caffe2() { # to CMakeLists.txt and aten/CMakeLists.txt, not here. # We need the vanilla cmake build to work. ${CMAKE_INSTALL} -j"$NUM_JOBS" + + # Install Python proto files + if [[ $FULL_CAFFE2 -ne 0 ]]; then + find . 
-name proto + for proto_file in ./caffe/proto/*.py; do + cp $proto_file "$BASE_DIR/caffe/proto/" + done + for proto_file in ./caffe2/proto/*.py; do + cp $proto_file "$BASE_DIR/caffe2/proto/" + done + fi popd } diff --git a/tools/cpp_build/libtorch/CMakeLists.txt b/tools/cpp_build/libtorch/CMakeLists.txt index 4feb0e52e5293d..3e0a012a620a66 100644 --- a/tools/cpp_build/libtorch/CMakeLists.txt +++ b/tools/cpp_build/libtorch/CMakeLists.txt @@ -231,6 +231,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/passes/create_autodiff_subgraphs.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/remove_expands.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/decompose_addmm.cpp + ${TORCH_SRC_DIR}/csrc/jit/passes/loop_unrolling.cpp ${TORCH_SRC_DIR}/csrc/jit/interned_strings.cpp ${TORCH_SRC_DIR}/csrc/jit/script/compiler.cpp ${TORCH_SRC_DIR}/csrc/jit/script/lexer.cpp diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index 7c3133aa3fd986..27fb9a9aa5c83a 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -77,7 +77,7 @@ def is_magic_method(api_name): return api_name.startswith('__') and api_name.endswith('__') -blacklisted_types = {'SparseTensor', 'Storage', 'ScalarType', 'optional'} +blacklisted_types = {'SparseTensorRef', 'Storage', 'ScalarType', 'optional', 'std::string'} default_only_types = {'Generator'} @@ -97,9 +97,8 @@ def is_jit_op(decl): # we currently only support vararg tensor lists when they are the _first_ argument # and the only tensor argument arguments = decl['arguments'] - has_tensorlist = any(arg['simple_type'] == 'TensorList' for arg in arguments) - num_tensor_args = sum(map(is_tensor_arg, arguments)) - if has_tensorlist and (num_tensor_args != 1 or arguments[0]['simple_type'] != 'TensorList'): + # Only support a single TensorList arg + if sum(arg['simple_type'] == 'TensorList' for arg in arguments) > 1: return False return ((not decl['api_name'].endswith('_') or is_magic_method(decl['api_name'])) and @@ -157,25 +156,54 @@ def emit_decl_variant(decl, is_positional_arg, has_tensorlist): # from the end of the stack static_inputs = sum(is_positional_arg) - 1 num_dynamic_inputs = 'varargs_length' + tensorlist_idx = [i for i, arg in enumerate(decl['arguments']) if arg['simple_type'] == 'TensorList'][0] else: static_inputs = sum(is_positional_arg) num_dynamic_inputs = static_inputs - real_inputs = count() + real_inputs = 0 for i, arg in enumerate(decl['arguments']): - # XXX: we currently support only TensorList ops that have a TensorList as - # the first argument, that is then followed by a number of positional args. + # This conditional allows us to process argument lists with a flattened argument list + # with a single TensorList. Given the sequence of arguments: + # a b c [d e f g] h i # [] is the list + # + # 1. For the section where we are processing positional inputs before the + # TensorList: + # a b c [d e f g] h i # [] is the list + # ~~~~~~~~~~~~ <- N + # we set this view_length to the total number of varargs inputs (i.e. the length) + # of the whole argument list. This means that indexing into the list using peek() + # we will retrieve arguments ar their true indices (i.e. peek at 0 points to a, + # 1 points to b, etc...). Similarly, we can use peekSlice() to index into the + # list itself this way. + # 2. After the list: + # a b c [d e f g] h i # [] is the list + # ~~~~~~ <- N + # Here we set the view length to static_inputs. In our example, + # we effectively ignore the fact that we have a list here. 
What is + # significant is that our index i is equivalent when the view length + # is right-justified, whether we have the list or not. Concretely, + # indexing h or i from `a b c [d e f g] h i` is equvalent to indexing + # h or i from `a b c h i`. + view_length = 'varargs_length' if has_tensorlist and i < tensorlist_idx else static_inputs + if arg['simple_type'] == 'TensorList': - arguments.append('peekSlice(stack, 0, varargs_length - {}, varargs_length)'.format(static_inputs)) + # NOTE: don't advance real_inputs here. After this we are going + # to switch over to indexing from the end as if we only had + # the static arguments. + arguments.append('peekSlice(stack, {}, varargs_length - {}, varargs_length)' + .format(real_inputs, static_inputs)) elif arg['simple_type'] in default_only_types: arguments.append(arg['default']) elif is_tensor_arg(arg): - arguments.append('std::move(peek(stack, {}, {}))'.format(next(real_inputs), static_inputs)) + arguments.append('std::move(peek(stack, {}, {}))'.format(real_inputs, view_length)) + real_inputs += 1 elif is_positional_arg[i]: template_kwargs = dict(type=arg['simple_type'], name=arg['name'], - i=next(real_inputs), - N=static_inputs) + i=real_inputs, + N=view_length) + real_inputs += 1 if is_sized_intlist_arg(arg): assign = POS_INTLIST_ASSIGNMENT.substitute(size=arg['size'], @@ -358,12 +386,13 @@ def emit_arg(arg, is_return): def emit(decl): arguments = [a for a in decl['arguments'] if a['simple_type'] not in default_only_types] n = get_name(decl['name']) + n_args = len(arguments) + n_returns = len(decl['returns']) + env['arguments'].append('// Arguments for {} ({} args, {} returns)'.format(decl['name'], n_args, n_returns)) for a in arguments: emit_arg(a, False) for a in decl['returns']: emit_arg(a, True) - n_args = len(arguments) - n_returns = len(decl['returns']) env['operators'].append('{{ {}, {}, {} }}, // FunctionSchema("{}", <{} arguments>, <{} returns>) '.format( n, n_args, n_returns, decl['name'], n_args, n_returns)) diff --git a/tools/jit/templates/aten_dispatch.cpp b/tools/jit/templates/aten_dispatch.cpp index f6d6f87bb16c14..d1545d52898890 100644 --- a/tools/jit/templates/aten_dispatch.cpp +++ b/tools/jit/templates/aten_dispatch.cpp @@ -79,6 +79,8 @@ int deviceForInputs(Stack & stack, size_t N) { std::unordered_set tensor_vararg_fns = { aten::cat, aten::stack, + aten::index, + aten::index_put, }; template diff --git a/tools/setup_helpers/miopen.py b/tools/setup_helpers/miopen.py new file mode 100644 index 00000000000000..a87996688e7172 --- /dev/null +++ b/tools/setup_helpers/miopen.py @@ -0,0 +1,16 @@ +import os +import glob + +from .env import IS_WINDOWS, IS_CONDA, CONDA_DIR, check_env_flag, gather_paths +from .rocm import WITH_ROCM, ROCM_HOME + + +WITH_MIOPEN = False +MIOPEN_LIB_DIR = None +MIOPEN_INCLUDE_DIR = None +MIOPEN_LIBRARY = None +if WITH_ROCM and not check_env_flag('NO_MIOPEN'): + WITH_MIOPEN = True + MIOPEN_LIB_DIR = ROCM_HOME + "/miopen/lib" + MIOPEN_INCLUDE_DIR = ROCM_HOME + "/miopen/include/miopen" + MIOPEN_LIBRARY = "MIOpen" diff --git a/tools/setup_helpers/rocm.py b/tools/setup_helpers/rocm.py new file mode 100644 index 00000000000000..b981dd82dc5020 --- /dev/null +++ b/tools/setup_helpers/rocm.py @@ -0,0 +1,5 @@ +from .env import check_env_flag +# Check if ROCM is enabled +WITH_ROCM = check_env_flag('WITH_ROCM') +ROCM_HOME = "/opt/rocm" +ROCM_VERSION = "" diff --git a/torch/_tensor_str.py b/torch/_tensor_str.py index d6915e4ac6f520..6fdc1f5a24711d 100644 --- a/torch/_tensor_str.py +++ b/torch/_tensor_str.py @@ -196,6 
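[Editor's note] A rough Python model of the view_length indexing described in the gen_jit_dispatch.py comment above. The peek/peek_slice helpers here are stand-ins that assume "element i within a right-justified window of the last N stack entries"; the real helpers are the C++ stack utilities used by the generated dispatch code.

```python
def peek(stack, i, N):
    # Assumed semantics: element i of the last N stack entries.
    return stack[len(stack) - N + i]

def peek_slice(stack, i, length, N):
    start = len(stack) - N + i
    return stack[start:start + length]

# a b c [d e f g] h i   ([] is the single TensorList)
stack = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
varargs_length = len(stack)  # 9 dynamic inputs
static_inputs = 5            # a, b, c, h, i

# 1. Before the list the view length is varargs_length, so indices are "true":
assert peek(stack, 0, varargs_length) == "a"
assert peek(stack, 2, varargs_length) == "c"

# The TensorList itself is the slice of the remaining dynamic inputs:
assert peek_slice(stack, 3, varargs_length - static_inputs,
                  varargs_length) == ["d", "e", "f", "g"]

# 2. After the list the view length is static_inputs; right-justified indexing
#    makes "a b c [d e f g] h i" and "a b c h i" equivalent for h and i:
assert peek(stack, 3, static_inputs) == "h"
assert peek(["a", "b", "c", "h", "i"], 3, static_inputs) == "h"
```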
+196,14 @@ def _tensor_str(self, indent, fmt, scale, sz, summarize): return '[' + tensor_str + ']' +def _maybe_wrap_suffix(suffix, indent, tensor_str): + suffix_len = len(suffix) + last_line_len = len(tensor_str) - tensor_str.rfind('\n') + 1 + if suffix_len > 2 and last_line_len + suffix_len > PRINT_OPTS.linewidth: + return ',\n' + ' ' * indent + suffix[2:] + return suffix + + def get_summarized_data(self): dim = self.dim() if dim == 0: @@ -224,27 +232,36 @@ def _str(self): indent = len(prefix) summarize = self.numel() > PRINT_OPTS.threshold - suffix = ')' + suffix = '' if not torch._C._is_default_type_cuda(): if self.device.type == 'cuda': - suffix = ', device=\'' + str(self.device) + '\'' + suffix + suffix += ', device=\'' + str(self.device) + '\'' else: if self.device.type == 'cpu' or torch.cuda.current_device() != self.device.index: - suffix = ', device=\'' + str(self.device) + '\'' + suffix + suffix += ', device=\'' + str(self.device) + '\'' if self.numel() == 0: # In an empty tensor, there are no elements to infer if the dtype should be int64, # so it must be shown explicitly. if self.dtype != torch.get_default_dtype(): - suffix = ', dtype=' + str(self.dtype) + suffix + suffix += ', dtype=' + str(self.dtype) tensor_str = '[]' else: if self.dtype != torch.get_default_dtype() and self.dtype != torch.int64: - suffix = ', dtype=' + str(self.dtype) + suffix + suffix += ', dtype=' + str(self.dtype) fmt, scale, sz = _number_format(get_summarized_data(self) if summarize else self) if scale != 1: prefix = prefix + SCALE_FORMAT.format(scale) + ' ' * indent tensor_str = _tensor_str(self, indent, fmt, scale, sz, summarize) + if self.grad_fn is not None: + suffix += ', grad_fn=<{}>'.format(type(self.grad_fn).__name__) + elif self.requires_grad: + suffix += ', requires_grad=True' + + suffix += ')' + + suffix = _maybe_wrap_suffix(suffix, indent, tensor_str) + return prefix + tensor_str + suffix diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py index bc96fa18ae41c1..79bf688fe01042 100644 --- a/torch/autograd/__init__.py +++ b/torch/autograd/__init__.py @@ -9,7 +9,7 @@ from .variable import Variable from .function import Function, NestedIOFunction -from .gradcheck import gradcheck +from .gradcheck import gradcheck, gradgradcheck from .grad_mode import no_grad, enable_grad, set_grad_enabled from . import profiler diff --git a/torch/autograd/gradcheck.py b/torch/autograd/gradcheck.py index 972f2656364a2a..648eb16d45058f 100644 --- a/torch/autograd/gradcheck.py +++ b/torch/autograd/gradcheck.py @@ -21,7 +21,7 @@ def make_jacobian(input, num_out): return None if not input.requires_grad: return None - return torch.zeros(input.nelement(), num_out) + return torch.zeros(input.nelement(), num_out, dtype=input.dtype) elif isinstance(input, Iterable): jacobians = list(filter( lambda x: x is not None, (make_jacobian(elem, num_out) for elem in input))) @@ -125,25 +125,36 @@ def _differentiable_outputs(x): def gradcheck(func, inputs, eps=1e-6, atol=1e-5, rtol=1e-3, raise_exception=True): - """Check gradients computed via small finite differences - against analytical gradients + r"""Check gradients computed via small finite differences against analytical + gradients - The check between numerical and analytical has the same behaviour as - numpy.allclose https://docs.scipy.org/doc/numpy/reference/generated/numpy.allclose.html - meaning it check that - absolute(a - n) <= (atol + rtol * absolute(n)) - is true for all elements of analytical jacobian a and numerical jacobian n. 
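[Editor's note] The practical effect of the _tensor_str.py changes above is that printed tensors now carry device/dtype/grad_fn/requires_grad suffixes, wrapped onto a new line when they would exceed the line width. A minimal sketch of what this looks like; exact element formatting and grad_fn class names vary by build, so the printed strings in the comments are only indicative.

```python
import torch

x = torch.ones(2, requires_grad=True)
y = x * 2

print(x)  # e.g. tensor([ 1.,  1.], requires_grad=True)
print(y)  # e.g. tensor([ 2.,  2.], grad_fn=<MulBackward>)  (grad_fn name varies)

# Empty tensors always show a non-default dtype explicitly, per the change above:
print(torch.zeros(0, dtype=torch.int32))  # e.g. tensor([], dtype=torch.int32)
```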
+ The check between numerical and analytical gradients has the same behaviour as + `numpy.allclose `_, + i.e., it checks that + + .. math:: + + \lvert a - n \rvert \leq \texttt{atol} + \texttt{rtol} \times \lvert n \rvert + + holds for all elements of analytical gradient :math:`a` and numerical + gradient :math:`n`. + + .. note:: + The default values are designed for :attr:`input` of double precision. + This check will likely fail if :attr:`input` is of single precision, + i.e., ``FloatTensor``. Args: - func: Python function that takes Tensor inputs and returns + func (function): a Python function that takes Tensor inputs and returns a Tensor or a tuple of Tensors - inputs: tuple of Tensors - eps: perturbation for finite differences - atol: absolute tolerance - rtol: relative tolerance - raise_exception: bool indicating whether to raise an exception if - gradcheck fails. The exception gives more information about the + inputs (tuple of Tensor): inputs to the function + eps (float, optional): perturbation for finite differences + atol (float, optional): absolute tolerance + rtol (float, optional): relative tolerance + raise_exception (bool, optional): indicating whether to raise an exception if + the check fails. The exception gives more information about the exact nature of the failure. This is helpful when debugging gradchecks. + Returns: True if all differences satisfy allclose condition """ @@ -208,19 +219,30 @@ def fn(input): def gradgradcheck(func, inputs, grad_outputs=None, eps=1e-6, atol=1e-5, rtol=1e-3, gen_non_contig_grad_outputs=False, raise_exception=True): - """Check gradients of gradients computed via small finite differences - against analytical gradients + r"""Check gradients of gradients computed via small finite differences + against analytical gradients + This function checks that backpropagating through the gradients computed - to the given grad_outputs are correct. + to the given :attr:`grad_outputs` are correct. + + The check between numerical and analytical gradients has the same behaviour as + `numpy.allclose `_, + i.e., it checks that + + .. math:: - The check between numerical and analytical has the same behaviour as - numpy.allclose https://docs.scipy.org/doc/numpy/reference/generated/numpy.allclose.html - meaning it check that - absolute(a - n) <= (atol + rtol * absolute(n)) - is true for all elements of analytical gradient a and numerical gradient n. + \lvert a - n \rvert \leq \texttt{atol} + \texttt{rtol} \times \lvert n \rvert + + holds for all elements of analytical gradient :math:`a` and numerical + gradient :math:`n`. + + .. note:: + The default values are designed for :attr:`input` of double precision. + This check will likely fail if :attr:`input` is of single precision, + i.e., ``FloatTensor``. Args: - func (function): Python function that takes Tensor inputs and returns + func (function): a Python function that takes Tensor inputs and returns a Tensor or a tuple of Tensors inputs (tuple of Tensor): inputs to the function grad_outputs (tuple of Tensor, optional): The gradients with respect to @@ -231,13 +253,12 @@ def gradgradcheck(func, inputs, grad_outputs=None, eps=1e-6, atol=1e-5, rtol=1e- gen_non_contig_grad_outputs (bool, optional): if :attr:`grad_outputs` is ``None`` and :attr:`gen_non_contig_grad_outputs` is ``True``, the randomly generated gradient outputs are made to be noncontiguous - raise_exception: bool indicating whether to raise an exception if - gradcheck fails. 
The exception gives more information about the + raise_exception (bool, optional): indicating whether to raise an exception if + the check fails. The exception gives more information about the exact nature of the failure. This is helpful when debugging gradchecks. Returns: - True if all differences satisfy allclose condition. Raises an exception - otherwise. + True if all differences satisfy allclose condition """ if grad_outputs is None: # If grad_outputs is not specified, create random Tensors of the same diff --git a/torch/csrc/THP.h b/torch/csrc/THP.h index 68451ea5c95744..6cb660e6a89149 100644 --- a/torch/csrc/THP.h +++ b/torch/csrc/THP.h @@ -24,6 +24,11 @@ #define LIBRARY_STATE_TYPE #define LIBRARY_STATE_TYPE_NOARGS +#define THWStorage THStorage +#define THWStorage_(NAME) THStorage_(NAME) +#define THWTensor THTensor +#define THWTensor_(NAME) THTensor_(NAME) + #include "PtrWrapper.h" #include "Exceptions.h" #include "Generator.h" diff --git a/torch/csrc/api/include/torch/detail/ordered_dict.h b/torch/csrc/api/include/torch/detail/ordered_dict.h new file mode 100644 index 00000000000000..c165ca5d5def82 --- /dev/null +++ b/torch/csrc/api/include/torch/detail/ordered_dict.h @@ -0,0 +1,220 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace detail { + +/// A simple ordered dictionary implementation, akin to Python's `OrderedDict`. +template +class OrderedDict { + public: + struct Item { + Item(Key key_, Value value_) + : key(std::move(key_)), value(std::move(value_)) {} + + Value& operator*() { + return value; + } + const Value& operator*() const { + return value; + } + Value* operator->() { + return &value; + } + const Value* operator->() const { + return &value; + } + + const Key key; + Value value; + }; + + // The lifetime of an iterator is bound to the lifetime of the `OrderedDict`. + // Further, any `insert()` operation may invalidate all iterators + // pointing into the vector. + + using Iterator = typename std::vector::iterator; + using ConstIterator = typename std::vector::const_iterator; + + /// Constructs the `OrderedDict` with a string that should describe the kind + /// of value stored in this `OrderedDict`, for example 'parameter' or + /// 'module'. + explicit OrderedDict(std::string subject = "Key") + : subject_(std::move(subject)) {} + + // Copy we have to do ourselves, because items' keys are const, so we have to + // re-insert the items. + OrderedDict(const OrderedDict& other) + : index_(other.index_), subject_(other.subject_) { + for (const auto& item : other.items_) { + items_.push_back(item); + } + } + + OrderedDict& operator=(const OrderedDict& other) { + index_ = other.index_; + items_.clear(); + for (auto& item : other.items_) { + items_.push_back(item); + } + subject_ = other.subject_; + return *this; + } + + // Move works by default, because you can move-construct vectors of const + // values.. + OrderedDict(OrderedDict&& other) = default; + OrderedDict& operator=(OrderedDict&& other) = default; + + ~OrderedDict() = default; + + /*implicit */ OrderedDict(std::initializer_list initializer_list) + : OrderedDict("Key") { + items_.reserve(initializer_list.size()); + for (auto& item : initializer_list) { + // Copy the key here and move it into the index. 
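[Editor's note] A minimal usage example for the gradcheck/gradgradcheck docstrings above. It relies only on what the patch states: gradgradcheck is now re-exported from torch.autograd, the comparison is the allclose-style test |a - n| <= atol + rtol * |n|, and the default tolerances are tuned for double precision, so FloatTensor inputs will likely fail.

```python
import torch
from torch.autograd import gradcheck, gradgradcheck

# Double-precision inputs, as recommended by the note in the docstring.
inputs = (torch.randn(4, 3, dtype=torch.float64, requires_grad=True),)

assert gradcheck(torch.sigmoid, inputs, eps=1e-6, atol=1e-5, rtol=1e-3)
assert gradgradcheck(torch.sigmoid, inputs)
```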
+ items_.emplace_back(item.key, std::move(item.value)); + index_.emplace(std::move(item.key), size() - 1); + } + } + + Iterator begin() { + return items_.begin(); + } + + ConstIterator begin() const { + return items_.begin(); + } + + Iterator end() { + return items_.end(); + } + + ConstIterator end() const { + return items_.end(); + } + + Item& front() { + return items_.front(); + } + + const Item& front() const { + return items_.front(); + } + + Item& back() { + return items_.back(); + } + + const Item& back() const { + return items_.back(); + } + + Item& operator[](size_t index) { + return items_[index]; + } + + const Item& operator[](size_t index) const { + return items_[index]; + } + + Value& operator[](const Key& key) { + return get(key); + } + + const Value& operator[](const Key& key) const { + return get(key); + } + + template + Value& insert(K&& key, V&& value) { + AT_CHECK(index_.count(key) == 0, subject_, " '", key, "' already defined"); + // Copy `key` here and move it into the index. + items_.emplace_back(key, std::forward(value)); + index_.emplace(std::forward(key), size() - 1); + return items_.back().value; + } + + /// Allows calling `insert` with an initializer list for the value, e.g. + /// `insert(key, {...})`. + Value& insert(Key key, Value&& value) { + return insert(std::move(key), std::move(value)); + } + + void update(OrderedDict&& other) { + for (auto& item : other) { + // We want to call `insert()` to prevent duplicate keys. + insert(std::move(item.key), std::move(item.value)); + } + } + + void update(const OrderedDict& other) { + for (auto& item : other) { + // We want to call `insert()` to prevent duplicate keys. + insert(item.key, item.value); + } + } + + Value* find(const Key& key) noexcept { + auto iterator = index_.find(key); + if (iterator == index_.end()) { + return nullptr; + } + return &items_[iterator->second].value; + } + + const Value* find(const Key& key) const noexcept { + auto iterator = index_.find(key); + if (iterator == index_.end()) { + return nullptr; + } + return &items_[iterator->second].value; + } + + Value& get(const Key& key) { + if (auto* value = find(key)) { + return *value; + } + AT_ERROR(subject_, " '", key, "' is not defined"); + } + + const Value& get(const Key& key) const { + if (auto* value = find(key)) { + return *value; + } + AT_ERROR(subject_, " '", key, "' is not defined"); + } + + void clear() { + index_.clear(); + items_.clear(); + } + + size_t size() const noexcept { + return items_.size(); + } + + bool is_empty() const noexcept { + return items_.empty(); + } + + const std::string& subject() const noexcept { + return subject_; + } + + private: + std::unordered_map index_; + std::vector items_; + std::string subject_; +}; +} // namespace detail +} // namespace torch diff --git a/torch/csrc/api/include/torch/nn/module.h b/torch/csrc/api/include/torch/nn/module.h index 7f998efcd032b0..63cb4aa46a2e1c 100644 --- a/torch/csrc/api/include/torch/nn/module.h +++ b/torch/csrc/api/include/torch/nn/module.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include @@ -93,26 +94,29 @@ class Module { } protected: - Variable register_parameter(const std::string& name, at::Tensor tensor); - Variable register_buffer(const std::string& name, at::Tensor tensor); + autograd::Variable& register_parameter(std::string name, at::Tensor tensor); + autograd::Variable& register_buffer(std::string name, at::Tensor tensor); template std::shared_ptr register_module( - const std::string& name, - const std::shared_ptr& module) { - const auto pair = 
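[Editor's note] The new torch::detail::OrderedDict above describes itself as akin to Python's OrderedDict, but with two differences worth calling out: insert() rejects duplicate keys, and get() raises an error built from the "subject" string. The Python sketch below is only a rough model of those semantics, not the real class.

```python
# Rough Python model of the semantics of torch::detail::OrderedDict above.
class OrderedDictModel:
    def __init__(self, subject="Key"):
        self.subject = subject
        self.items = []   # (key, value) pairs, in insertion order
        self.index = {}   # key -> position in self.items

    def insert(self, key, value):
        if key in self.index:
            raise KeyError("{} '{}' already defined".format(self.subject, key))
        self.index[key] = len(self.items)
        self.items.append((key, value))
        return value

    def find(self, key):
        pos = self.index.get(key)
        return None if pos is None else self.items[pos][1]

    def get(self, key):
        value = self.find(key)
        if value is None:
            raise KeyError("{} '{}' is not defined".format(self.subject, key))
        return value

params = OrderedDictModel("Parameter")
params.insert("weight", [1.0, 2.0])
params.insert("bias", [0.0])
# params.insert("weight", ...)  -> raises: Parameter 'weight' already defined
```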
children_.emplace(name, module); - AT_CHECK(pair.second, "Module has already been registered"); - return module; + std::string name, + std::shared_ptr module) { + auto& base_module = children_.insert(std::move(name), std::move(module)); + return std::static_pointer_cast(base_module); } private: + template + using OrderedDict = torch::detail::OrderedDict; + template friend class CloneableModule; virtual void clone_(Module& other); - std::unordered_map parameters_; - std::unordered_map> children_; + OrderedDict parameters_; + OrderedDict buffers_; + OrderedDict> children_; /// The module's name (e.g. "LSTM"). mutable at::optional name_; @@ -151,13 +155,11 @@ class CloneableModule : public Module { copy->parameters_.clear(); copy->children_.clear(); copy->reset(); - for (auto& parameter : parameters_) { - copy->parameters_.at(parameter.first) - .data() - .copy_(parameter.second.data()); + for (const auto& parameter : parameters_) { + copy->parameters_[parameter.key].data().copy_(parameter->data()); } - for (auto& child : children_) { - copy->children_.at(child.first)->clone_(*child.second); + for (const auto& child : children_) { + copy->children_[child.key]->clone_(*child.value); } return copy; } diff --git a/torch/csrc/api/src/nn/module.cpp b/torch/csrc/api/src/nn/module.cpp index dac4093b731d80..3f9ca4a46630b5 100644 --- a/torch/csrc/api/src/nn/module.cpp +++ b/torch/csrc/api/src/nn/module.cpp @@ -6,7 +6,6 @@ #include #include -#include #include #include #include @@ -45,15 +44,13 @@ std::shared_ptr Module::clone() const { std::map Module::parameters() const { std::map ret; - for (const auto& pair : children_) { - auto& name = pair.first; - auto& child = pair.second; - for (auto& p : child->parameters()) { - ret[name + "." + p.first] = p.second; + for (const auto& child : children_) { + for (auto& p : child.value->parameters()) { + ret[child.key + "." 
+ p.first] = p.second; } } - for (const auto& pair : parameters_) { - ret[pair.first] = pair.second; + for (const auto& parameter : parameters_) { + ret[parameter.key] = parameter.value; } return ret; } @@ -67,34 +64,33 @@ Variable& Module::param(std::string const& name) { break; } - auto child_name = name.substr(begin, dot_pos - begin); - auto it = container->children_.find(child_name); - if (it == container->children_.end()) { - throw std::runtime_error("No such child: " + child_name); + const auto child_name = name.substr(begin, dot_pos - begin); + if (auto* child = container->children_.find(child_name)) { + container = child->get(); + } else { + AT_ERROR("No such child: ", child_name); } - container = it->second.get(); begin = dot_pos + 1; // Skip the dot } - auto param_name = name.substr(begin); - auto it = container->parameters_.find(param_name); - if (it == container->parameters_.end()) { - throw std::runtime_error("No such param: " + param_name); + const auto parameter_name = name.substr(begin); + if (auto* parameter = container->parameters_.find(parameter_name)) { + return *parameter; } - return it->second; + AT_ERROR("No such param: ", parameter_name); } void Module::train() { - for (auto& pair : children_) { - pair.second->train(); + for (auto& child : children_) { + child.value->train(); } is_training_ = true; } void Module::eval() { - for (auto& pair : children_) { - pair.second->eval(); + for (auto& child : children_) { + child.value->eval(); } is_training_ = false; } @@ -109,39 +105,36 @@ void Module::cpu() { void Module::to(at::Type& type) { for (auto& child : children_) { - child.second->to(type); + child.value->to(type); } - for (auto& pair : parameters_) { - auto parameter = pair.second; - at::detail::set_data(parameter, parameter.data().toType(type)); - AT_ASSERT(parameter.data().type() == type); - AT_ASSERT(¶meter.type() == autograd::VariableType::getType(type)); + for (auto& parameter : parameters_) { + at::detail::set_data(*parameter, parameter->data().toType(type)); + AT_ASSERT(parameter->data().type() == type); + AT_ASSERT(¶meter->type() == autograd::VariableType::getType(type)); } } void Module::to(at::ScalarType scalar_type) { for (auto& child : children_) { - child.second->to(scalar_type); + child.value->to(scalar_type); } - for (auto& pair : parameters_) { - auto parameter = pair.second; - auto& new_type = parameter.data().type().toScalarType(scalar_type); - at::detail::set_data(parameter, parameter.data().toType(new_type)); - AT_ASSERT(parameter.data().type().scalarType() == scalar_type); - AT_ASSERT(parameter.type().scalarType() == scalar_type); + for (auto& parameter : parameters_) { + auto& new_type = parameter->data().type().toScalarType(scalar_type); + at::detail::set_data(*parameter, parameter->data().toType(new_type)); + AT_ASSERT(parameter->data().type().scalarType() == scalar_type); + AT_ASSERT(parameter->type().scalarType() == scalar_type); } } void Module::to(at::Backend backend) { for (auto& child : children_) { - child.second->to(backend); + child.value->to(backend); } - for (auto& pair : parameters_) { - auto parameter = pair.second; - auto& new_type = parameter.data().type().toBackend(backend); - at::detail::set_data(parameter, parameter.data().toType(new_type)); - AT_ASSERT(parameter.data().type().backend() == backend); - AT_ASSERT(parameter.type().backend() == backend); + for (auto& parameter : parameters_) { + auto& new_type = parameter->data().type().toBackend(backend); + at::detail::set_data(*parameter, parameter->data().toType(new_type)); 
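[Editor's note] The reworked Module::parameters() above composes names as child.key + "." + p.first, i.e. the same dotted "<child>.<param>" convention the Python API already uses. A short Python illustration of that naming, shown here only as a point of comparison for the C++ behavior:

```python
import torch.nn as nn

model = nn.Sequential(nn.Linear(3, 4), nn.ReLU(), nn.Linear(4, 1))
print([name for name, _ in model.named_parameters()])
# ['0.weight', '0.bias', '2.weight', '2.bias']
```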
+ AT_ASSERT(parameter->data().type().backend() == backend); + AT_ASSERT(parameter->type().backend() == backend); } } @@ -151,27 +144,25 @@ bool Module::is_training() const noexcept { void Module::zero_grad() { for (auto& child : children_) { - child.second->zero_grad(); + child.value->zero_grad(); } - for (auto& pair : parameters_) { - pair.second.grad().zero_(); + for (auto& parameter : parameters_) { + parameter->grad().zero_(); } } -Variable Module::register_parameter( - const std::string& name, +autograd::Variable& Module::register_parameter( + std::string name, at::Tensor tensor) { auto variable = autograd::make_variable(tensor, /*requires_grad=*/true); - const auto pair = parameters_.emplace(name, std::move(variable)); - AT_CHECK(pair.second, "Parameter has already been registered"); - return pair.first->second; + return parameters_.insert(std::move(name), std::move(variable)); } -Variable Module::register_buffer(const std::string& name, at::Tensor tensor) { +autograd::Variable& Module::register_buffer( + std::string name, + at::Tensor tensor) { auto variable = autograd::make_variable(tensor, /*requires_grad=*/false); - const auto pair = parameters_.emplace(name, std::move(variable)); - AT_CHECK(pair.second, "Parameter has already been registered"); - return pair.first->second; + return parameters_.insert(std::move(name), std::move(variable)); } void Module::clone_(Module& other) {} diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index 8f4d72398aca32..88aec1573e1e3b 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -288,6 +288,49 @@ static variable_list call_post_hooks(Function& fn, variable_list outputs, variab return outputs; } +static bool is_compatible_type(const at::Type& expected, const at::Type& actual) { + // Types are compatible if they exactly match or if the gradient is a sparse + // version of the expected type. + return expected == actual || (actual.is_sparse() && + expected == actual.toBackend(toDense(actual.backend()))); +} + +template +static void validate_outputs(const edge_list& edges, const variable_list& grads, const F& format_error) { + if (grads.size() != edges.size()) { + std::stringstream ss; + ss << "invalid number of gradients - expected "; + ss << edges.size() << ", but got " << grads.size(); + throw std::runtime_error(format_error(ss.str())); + } + for (size_t i = 0; i < grads.size(); i++) { + const auto& edge = edges[i]; + if (!edge.is_valid()) continue; + + const auto& metadata = edge.function->input_metadata(edge.input_nr); + const auto& output = grads[i]; + if (!output.defined()) { + // FIXME: TestJit.test_ge_optimized fails this assertion. 
+ // std::stringstream ss; + // ss << "undefined gradient at index " << i; + // throw std::runtime_error(format_error(ss.str())); + continue; + } + if (!grads[i].sizes().equals(metadata.shape())) { + std::stringstream ss; + ss << "invalid gradient at index " << i << " - expected shape "; + ss << metadata.shape() << " but got " << grads[i].sizes(); + throw std::runtime_error(format_error(ss.str())); + } + if (!is_compatible_type(metadata.type(), grads[i].type())) { + std::stringstream ss; + ss << "invalid gradient at index " << i << " - expected type "; + ss << metadata.type() << " but got " << grads[i].type(); + throw std::runtime_error(format_error(ss.str())); + } + } +} + static variable_list call_function(FunctionTask& task) { bool prev_checkpoint_valid_state = checkpoint_valid; checkpoint_valid = task.base->can_checkpoint() && prev_checkpoint_valid_state; @@ -298,6 +341,11 @@ static variable_list call_function(FunctionTask& task) { fn.will_release_variables(); } auto outputs = fn(inputs); + validate_outputs(fn.next_edges(), outputs, [&](const std::string& msg) { + std::ostringstream ss; + ss << "Function " << fn.name() << " returned an " << msg; + return ss.str(); + }); checkpoint_valid = prev_checkpoint_valid_state; return call_post_hooks(fn, std::move(outputs), std::move(inputs)); } @@ -323,13 +371,6 @@ auto Engine::evaluate_function(FunctionTask& task) -> void { fn.release_variables(); } - if (outputs.size() != fn.num_outputs()) { - std::stringstream ss; - ss << "Function '" << fn.name() << "' returned an invalid number of outputs - expected "; - ss << fn.num_outputs() << ", but got " << outputs.size(); - throw std::runtime_error(ss.str()); - } - int num_outputs = outputs.size(); if (num_outputs == 0) return; // Don't even acquire the mutex std::lock_guard lock(task.base->mutex); @@ -426,6 +467,11 @@ auto Engine::execute(const edge_list& input_roots, bool create_graph, const edge_list& outputs) -> variable_list { std::call_once(start_threads_flag, &Engine::start_threads, this); + + validate_outputs(input_roots, inputs, [](const std::string& msg) { + return msg; + }); + // Callbacks are only valid for the duration of this run and should always be cleared ClearCallbacks _cb_guard(final_callbacks, post_callbacks_lock); diff --git a/torch/csrc/autograd/function.cpp b/torch/csrc/autograd/function.cpp index 47b4e0620e0d7c..d116069149fa1e 100644 --- a/torch/csrc/autograd/function.cpp +++ b/torch/csrc/autograd/function.cpp @@ -19,8 +19,8 @@ namespace torch { namespace autograd { thread_local uint64_t Function::next_sequence_nr_ = 0; -auto Function::name() -> std::string { - return std::string(typeid(*this).name()); +auto Function::name() const -> std::string { + return at::demangle(typeid(*this).name()); } // This function is analogous to make_trace which operates on PythonOp, but this diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index 77e9b218469e7c..d76296a10a11d8 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -5,6 +5,7 @@ #include "torch/csrc/autograd/grad_mode.h" #include "torch/csrc/autograd/profiler.h" #include "torch/csrc/autograd/saved_variable.h" +#include "torch/csrc/autograd/type_and_shape.h" #include "torch/csrc/autograd/variable.h" #include "torch/csrc/jit/tracer.h" #include "torch/csrc/utils/auto_unique_ptr.h" @@ -91,18 +92,15 @@ struct Function : std::enable_shared_from_this { /// in the backward() pass, with higher sequence numbers prioritized /// before lower sequence numbers. 
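[Editor's note] The new validate_outputs() above checks each gradient produced during backward against the type-and-shape metadata recorded for the corresponding input edge, and call_function() wraps the message with the function name. A sketch of how this surfaces in Python is below; the custom Function deliberately returns a wrongly shaped gradient, and the exact wording of the error is approximate.

```python
import torch
from torch.autograd import Function

class BadGrad(Function):
    @staticmethod
    def forward(ctx, x):
        return x * 2

    @staticmethod
    def backward(ctx, grad_output):
        # Wrong shape on purpose: validate_outputs() compares this against the
        # recorded input metadata (type + shape) of the next edge.
        return torch.ones(1)

x = torch.randn(3, requires_grad=True)
try:
    BadGrad.apply(x).sum().backward()
except RuntimeError as e:
    print(e)  # e.g. "Function BadGradBackward returned an invalid gradient at
              # index 0 - expected shape [3] but got [1]" (wording may vary)
```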
explicit Function( - uint32_t num_inputs, uint64_t sequence_nr, edge_list&& next_edges = edge_list()) : sequence_nr_(sequence_nr), - num_inputs_(num_inputs), next_edges_(std::move(next_edges)) {} explicit Function( - uint32_t num_inputs = 0, edge_list&& next_edges = edge_list()) - : Function(num_inputs, next_sequence_nr_++, std::move(next_edges)) {} - + : Function(next_sequence_nr_++, std::move(next_edges)) {} + /// Functions are neither copyable nor moveable. Function(const Function& other) = delete; Function(Function&& other) = delete; @@ -123,20 +121,37 @@ struct Function : std::enable_shared_from_this { // Graph Connectivity API //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // Inputs + // Inputs. NOTE: inputs of the grad_fn correspond to Tensor outputs of the + // forward function. + + // Marker for expected undefined input + struct undefined_input {}; - /// Increments the number of inputs of the function and returns the previous - /// value. - uint32_t bump_inputs() noexcept { - return num_inputs_++; + /// Adds the type and shape metadata for a new input. Returns the index of + /// of the new input. + uint32_t add_input_metadata(const at::Type& type, at::IntList shape) noexcept { + uint32_t input_nr = input_metadata_.size(); + input_metadata_.emplace_back(type, shape); + return input_nr; } - void set_num_inputs(uint32_t num_inputs) noexcept { - num_inputs_ = num_inputs; + /// Adds a placeholder for an input that will not be used. + uint32_t add_input_metadata(undefined_input u) noexcept { + uint32_t input_nr = input_metadata_.size(); + input_metadata_.emplace_back(); + return input_nr; } uint32_t num_inputs() const noexcept { - return num_inputs_; + return input_metadata_.size(); + } + + const TypeAndShape& input_metadata(size_t index) const { + return input_metadata_[index]; + } + + void clear_input_metadata() { + input_metadata_.clear(); } // Outputs ("Next Edges") @@ -185,7 +200,7 @@ struct Function : std::enable_shared_from_this { } /// Returns the name of the dynamic type of the function, for debugging. - virtual std::string name(); + virtual std::string name() const; /// Returns true if the particular output edge is active, and that particular /// output of this function should be computed. @@ -312,12 +327,12 @@ struct Function : std::enable_shared_from_this { // fields. const uint64_t sequence_nr_; - uint32_t num_inputs_; edge_list next_edges_; PyObject* pyobj_ = nullptr; // weak reference std::vector> pre_hooks_; std::vector> post_hooks_; auto_unique_ptr tracing_state_; + at::SmallVector input_metadata_; }; /// See Function::is_traceable() for definition. @@ -355,13 +370,14 @@ struct MakeNextFunctionList : IterArgs { /// `input_nr` thus equal to `function->num_inputs()`. Additionally, it /// increments the `Function`'s number of inputs by one. Approximately /// equivalent to `variable.set_gradient_edge(function, -/// function->bump_inputs())`. If you don't want the `Function`'s `num_inputs` -/// to be incremented, use `set_gradient_edge` directly. +/// function->add_input_metadata(variable.type(), variable.sizes()))`. +/// If you don't want the `Function`'s `num_inputs` to be incremented, use +/// `set_gradient_edge` directly. inline void create_gradient_edge( Variable& variable, std::shared_ptr function) { // Copy before move. 
- const auto input_nr = function->bump_inputs(); + const auto input_nr = function->add_input_metadata(variable.type(), variable.sizes()); variable.set_gradient_edge({std::move(function), input_nr}); } diff --git a/torch/csrc/autograd/functions/accumulate_grad.cpp b/torch/csrc/autograd/functions/accumulate_grad.cpp index 31fc2434ea5d90..fdbe54d3fc3e92 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.cpp +++ b/torch/csrc/autograd/functions/accumulate_grad.cpp @@ -13,12 +13,13 @@ using at::Tensor; namespace torch { namespace autograd { -// AccumulateGrad sets sequence_nr to the max value so it's always called +// AccumulateGrad sets sequence_nr to the max value so it's always called // ASAP during backwards. AccumulateGrad::AccumulateGrad(Variable variable_) - : Function(/*num_inputs=*/1 - , /*sequence_nr=*/UINT64_MAX) - , variable(std::move(variable_)) {} + : Function(/*sequence_nr=*/UINT64_MAX) + , variable(std::move(variable_)) { + add_input_metadata(variable.type(), variable.sizes()); +} auto AccumulateGrad::apply(const variable_list& grads) -> variable_list { // XXX: this method is not thread-safe! diff --git a/torch/csrc/autograd/functions/basic_ops.h b/torch/csrc/autograd/functions/basic_ops.h index a630c3c79eb378..7ac4d2a3faa3c8 100644 --- a/torch/csrc/autograd/functions/basic_ops.h +++ b/torch/csrc/autograd/functions/basic_ops.h @@ -12,7 +12,7 @@ namespace torch { namespace autograd { struct Error : public Function { Error(std::string msg, edge_list&& next_edges) - : Function(/*num_inputs=*/0, std::move(next_edges)) + : Function(std::move(next_edges)) , msg(std::move(msg)) {} Error(std::string msg) @@ -35,7 +35,7 @@ struct DelayedError : public Function { struct GraphRoot : public Function { GraphRoot(edge_list functions, variable_list inputs) - : Function(/*num_inputs=*/0, std::move(functions)), + : Function(std::move(functions)), outputs(std::move(inputs)) {} virtual variable_list apply(const variable_list& inputs) { diff --git a/torch/csrc/autograd/functions/special.cpp b/torch/csrc/autograd/functions/special.cpp index a5c5a6228e4098..88ac969122b780 100644 --- a/torch/csrc/autograd/functions/special.cpp +++ b/torch/csrc/autograd/functions/special.cpp @@ -17,7 +17,9 @@ namespace torch { namespace autograd { // Used when an output has multiple uses (there's only one entry // in next_edges per output). struct Replicate : public Function { - Replicate() : Function(/*num_inputs=*/1) {} + Replicate(const at::Type& type, at::IntList shape) : Function() { + add_input_metadata(type, shape); + } virtual variable_list apply(const variable_list& inputs) { TORCH_ASSERT(inputs.size() == 1); @@ -236,6 +238,7 @@ bool Eval::replaceSubgraph(const variable_list& inputs, const variable_list& _ou // This detaches the subgraph from the full backward graph. for (auto& begin : subgraph.boundary.begins) { const auto& edge = begin.function->next_edge(begin.input_nr); + begin.function->set_next_edge( begin.input_nr, Edge(ends_to_outputs.at(edge), 0)); } @@ -265,7 +268,7 @@ bool Eval::replaceSubgraph(const variable_list& inputs, const variable_list& _ou // the same Variable has been returned multiple times, and // is repeated in this list. 
if (output.grad_fn_unsafe() == this) { - auto replicate = std::make_shared(); + auto replicate = std::make_shared(output.type(), output.sizes()); replicate->add_next_edge({this_shared, output.output_nr()}); output.set_gradient_edge({std::move(replicate), 0}); repeated_outputs.emplace(&output); @@ -274,7 +277,8 @@ bool Eval::replaceSubgraph(const variable_list& inputs, const variable_list& _ou // perform any allocations until we actually see repeated outputs. if (repeated_outputs.count(&output) > 0) { auto & replicate = output.grad_fn(); - replicate->add_next_edge({this_shared, num_inputs_++}); + auto input_nr = add_input_metadata(output.type(), output.sizes()); + replicate->add_next_edge({this_shared, input_nr}); } else { autograd::create_gradient_edge(output, this_shared); } diff --git a/torch/csrc/autograd/functions/special.h b/torch/csrc/autograd/functions/special.h index 076a4faa98767a..273b139e238780 100644 --- a/torch/csrc/autograd/functions/special.h +++ b/torch/csrc/autograd/functions/special.h @@ -15,7 +15,9 @@ namespace torch { namespace autograd { struct EvalOutput : Function { explicit EvalOutput(const Edge& next_edge_) - : Function(/*num_inputs=*/1), next_edge(next_edge_) {} + : Function(), next_edge(next_edge_) { + add_input_metadata(undefined_input()); + } virtual variable_list apply(const variable_list& inputs) override { throw std::logic_error("EvalOutput::apply() called"); diff --git a/torch/csrc/autograd/functions/tensor.cpp b/torch/csrc/autograd/functions/tensor.cpp index 75ec8180c95983..5df72d52630680 100644 --- a/torch/csrc/autograd/functions/tensor.cpp +++ b/torch/csrc/autograd/functions/tensor.cpp @@ -36,12 +36,13 @@ CopySlices::CopySlices( const Variable& base_var, at::TensorGeometry view_, std::shared_ptr fn_) - : Function(/*num_inputs=*/1), + : Function(), base(base_var), view(std::move(view_)), fn(std::move(fn_)) { // Take the next_edges of fn as our own, except for index 0 which goes // to base instead of the view. 
+ add_input_metadata(base_var.type(), base_var.sizes()); const auto num_outputs = fn->num_outputs(); next_edges_.reserve(num_outputs); add_next_edge(base_var.gradient_edge()); diff --git a/torch/csrc/autograd/functions/utils.cpp b/torch/csrc/autograd/functions/utils.cpp index 09939a109a6c50..485572d9ee812c 100644 --- a/torch/csrc/autograd/functions/utils.cpp +++ b/torch/csrc/autograd/functions/utils.cpp @@ -29,7 +29,7 @@ variable_list wrap_outputs(const variable_list& inputs, tensor_list&& outputs, autograd::create_gradient_edge(variable, grad_fn); result.push_back(std::move(variable)); } else { - grad_fn->bump_inputs(); + grad_fn->add_input_metadata(Function::undefined_input()); result.emplace_back(); } } diff --git a/torch/csrc/autograd/python_engine.cpp b/torch/csrc/autograd/python_engine.cpp index e07d88feeeb21d..e52e06a34199c5 100644 --- a/torch/csrc/autograd/python_engine.cpp +++ b/torch/csrc/autograd/python_engine.cpp @@ -134,7 +134,7 @@ PyObject *THPEngine_run_backward(THPEngine *self, PyObject *args, PyObject *kwar } } - edge_list output_edges; + std::vector output_edges; if (inputs != nullptr) { int num_inputs = PyTuple_GET_SIZE(inputs); output_edges.reserve(num_inputs); diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp index 3fb53c9a6d804f..4672d535ef5d9a 100644 --- a/torch/csrc/autograd/python_function.cpp +++ b/torch/csrc/autograd/python_function.cpp @@ -207,7 +207,7 @@ auto PyFunction::release_variables() -> void { f->has_freed_buffers = 1; } -auto PyFunction::name() -> std::string { +auto PyFunction::name() const -> std::string { AutoGIL gil; auto f = (THPFunction*) obj; auto name = std::string(Py_TYPE(f)->tp_name); @@ -245,7 +245,7 @@ static int THPFunction_traverse(THPFunction *self, visitproc visit, void *arg) static int THPFunction_clear(THPFunction *self) { - self->cdata.set_num_inputs(0); + self->cdata.clear_input_metadata(); Py_CLEAR(self->needs_input_grad); @@ -293,7 +293,6 @@ PyObject *THPFunction_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) new (&self->input_info) std::vector(); new (&self->saved_variables) std::vector(); new (&self->is_variable_input) std::vector(); - self->cdata.set_num_inputs(0); return obj; } @@ -425,6 +424,10 @@ static void _wrap_outputs(THPFunction *self, // Note that output Variables may be repeated. In that case, the last call // to set_history wins. 
auto var = as_variable(obj, i); + if (cdata) { + auto output_nr = cdata->add_input_metadata(var.type(), var.sizes()); + TORCH_ASSERT(i == (int)output_nr); + } set_history(var, i, is_input, is_modified, is_differentiable); if (is_executable) { @@ -616,7 +619,7 @@ PyObject* process_outputs(PyObject *op_obj, THPFunction* grad_fn, const Unpacked THPObjectPtr outputs(PyTuple_New(num_outputs)); if (!outputs) throw python_error(); - grad_fn->cdata.set_num_inputs(num_outputs); + grad_fn->cdata.clear_input_metadata(); // Record type, device, and size information about inputs if (is_executable) { diff --git a/torch/csrc/autograd/python_function.h b/torch/csrc/autograd/python_function.h index 562ab79a280f68..529bcaf4546a15 100644 --- a/torch/csrc/autograd/python_function.h +++ b/torch/csrc/autograd/python_function.h @@ -35,7 +35,7 @@ struct PyFunction : public Function { variable_list legacy_apply(const variable_list& inputs); virtual void release_variables() override; - virtual std::string name() override; + virtual std::string name() const override; virtual std::shared_ptr get_shared_ptr() override; virtual bool is_traceable() override; diff --git a/torch/csrc/autograd/python_legacy_variable.cpp b/torch/csrc/autograd/python_legacy_variable.cpp index 179c54a1e3477d..33dd281ab078d5 100644 --- a/torch/csrc/autograd/python_legacy_variable.cpp +++ b/torch/csrc/autograd/python_legacy_variable.cpp @@ -57,7 +57,7 @@ static PyObject *THPVariable_pynew(PyTypeObject* type, PyObject *args, PyObject Variable var; if (grad_fn) { auto grad_fn_ = THPFunction_asFunction((THPFunction*)grad_fn); - Edge edge(grad_fn_, grad_fn_->bump_inputs()); + Edge edge(grad_fn_, grad_fn_->add_input_metadata(tensor.type(), tensor.sizes())); var = make_variable(std::move(tensor), std::move(edge)); } else { var = make_variable(std::move(tensor), requires_grad); diff --git a/torch/csrc/autograd/type_and_shape.h b/torch/csrc/autograd/type_and_shape.h new file mode 100644 index 00000000000000..01a62fa2cf18a9 --- /dev/null +++ b/torch/csrc/autograd/type_and_shape.h @@ -0,0 +1,34 @@ +#pragma once + +#include +#include "torch/csrc/assertions.h" + +namespace torch { namespace autograd { + +/// A tensor's type and shape. Each Function records the required type and +/// shape of its inputs. If is_valid() is false, then the corresponding input +/// is not used and may be an undefined tensor. +struct TypeAndShape { + TypeAndShape() : type_(nullptr) {} + + TypeAndShape(const at::Type& type, at::IntList shape) + : type_(&type) , shape_(shape) {} + + bool is_valid() const { + return type_ != nullptr; + } + + const at::Type& type() const { + TORCH_ASSERT(type_); + return *type_; + } + + at::IntList shape() const { + return shape_; + } + + const at::Type* type_; + at::DimVector shape_; +}; + +}} diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index f0b4c1df9d99fb..48a35f4f88795f 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -126,10 +126,12 @@ void Variable::Impl::backward( } void Variable::Impl::set_data(Tensor new_data) { - data_ = std::move(new_data); - if (data_.type() != *type_) { - type_ = VariableType::getType(data_); + if (new_data.type() != data_.type()) { + type_ = VariableType::getType(new_data.type()); + // Clear grad_accumulator if it exists, since it stores the old type info. 
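The autograd changes above and below all follow the same new pattern: instead of passing /*num_inputs=*/N to the Function constructor (or calling bump_inputs/set_num_inputs later), a function now records the type and shape of each expected gradient slot via add_input_metadata(), which returns the slot index, and uses undefined_input() as a placeholder when that slot may stay undefined. A minimal sketch of the pattern, using a hypothetical ScaleBackward function that is not part of this patch:

struct ScaleBackward : public Function {
  explicit ScaleBackward(const Variable& input) : Function() {
    // Records a TypeAndShape entry for the single gradient slot; returns 0 here.
    auto input_nr = add_input_metadata(input.type(), input.sizes());
    (void)input_nr;
    // If the corresponding gradient may legitimately be undefined, record a
    // placeholder instead: add_input_metadata(undefined_input());
  }
  virtual variable_list apply(const variable_list& grads) override {
    return grads;  // placeholder body; only the metadata bookkeeping matters here
  }
};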
+ grad_accumulator_.reset(); } + data_ = std::move(new_data); } Variable::ViewImpl::ViewImpl(Variable base, at::Tensor data, Edge gradient_edge) @@ -158,7 +160,7 @@ std::shared_ptr& Variable::ViewImpl::get_grad_fn() { fn->stride = strides(); fn->storage_offset = data_.storage_offset(); fn->set_next_edges(collect_next_edges(base_)); - fn->set_num_inputs(1); + fn->add_input_metadata(base_.type(), sizes()); grad_fn_ = std::move(fn); attr_version = current_version; } diff --git a/torch/csrc/cuda/Storage.cpp b/torch/csrc/cuda/Storage.cpp index a9806d9aa5554f..4e3d1b4ef3dd83 100644 --- a/torch/csrc/cuda/Storage.cpp +++ b/torch/csrc/cuda/Storage.cpp @@ -19,7 +19,7 @@ #include template<> -void THPPointer::free() { +void THPPointer::free() { if (ptr) THCStorage_free(LIBRARY_STATE ptr); } diff --git a/torch/csrc/cuda/override_macros.h b/torch/csrc/cuda/override_macros.h index a124903bf51cb5..439f33db001f67 100644 --- a/torch/csrc/cuda/override_macros.h +++ b/torch/csrc/cuda/override_macros.h @@ -1,14 +1,14 @@ #include "undef_macros.h" -#define THStoragePtr THCStoragePtr +#define THWStoragePtr THCStoragePtr #define THPStoragePtr THCPStoragePtr -#define THTensorPtr THCTensorPtr +#define THWTensorPtr THCTensorPtr #define THPTensorPtr THCPTensorPtr -#define THStorage THCStorage -#define THStorage_(NAME) THCStorage_(NAME) -#define THTensor THCTensor -#define THTensor_(NAME) THCTensor_(NAME) +#define THWStorage THCStorage +#define THWStorage_(NAME) THCStorage_(NAME) +#define THWTensor THCTensor +#define THWTensor_(NAME) THCTensor_(NAME) #define THPStorage_(NAME) TH_CONCAT_4(THCP,Real,Storage_,NAME) #define THPStorage THCPStorage @@ -29,10 +29,7 @@ #define THPTensorStateless THCPTensorStateless -#define THSTensorPtr THCSTensorPtr #define THSPTensorPtr THCSPTensorPtr -#define THSTensor THCSTensor -#define THSTensor_(NAME) THCSTensor_(NAME) #define THSPTensor_(NAME) TH_CONCAT_4(THCSP,Real,Tensor_,NAME) #define THSPTensor_stateless_(NAME) TH_CONCAT_4(THCSP,Real,Tensor_stateless_,NAME) diff --git a/torch/csrc/cuda/restore_macros.h b/torch/csrc/cuda/restore_macros.h index ec6eee25232446..054eec69897bbb 100644 --- a/torch/csrc/cuda/restore_macros.h +++ b/torch/csrc/cuda/restore_macros.h @@ -1,6 +1,6 @@ -#define THTensor TH_CONCAT_3(TH,Real,Tensor) -#define THTensor_(NAME) TH_CONCAT_4(TH,Real,Tensor_,NAME) +#define THWTensor TH_CONCAT_3(TH,Real,Tensor) +#define THWTensor_(NAME) TH_CONCAT_4(TH,Real,Tensor_,NAME) #define THPTensor TH_CONCAT_3(THP,Real,Tensor) #define THPTensorStr TH_CONCAT_STRING_3(torch.,Real,Tensor) @@ -13,8 +13,8 @@ #define THPStorage_(NAME) TH_CONCAT_4(THP,Real,Storage_,NAME) #ifdef _THP_CORE -#define THStoragePtr TH_CONCAT_3(TH,Real,StoragePtr) -#define THTensorPtr TH_CONCAT_3(TH,Real,TensorPtr) +#define THWStoragePtr TH_CONCAT_3(TH,Real,StoragePtr) +#define THWTensorPtr TH_CONCAT_3(TH,Real,TensorPtr) #define THPStoragePtr TH_CONCAT_3(THP,Real,StoragePtr) #define THPTensorPtr TH_CONCAT_3(THP,Real,TensorPtr) #endif diff --git a/torch/csrc/cuda/undef_macros.h b/torch/csrc/cuda/undef_macros.h index bfc6600959cc5a..a6d5e834b5f76e 100644 --- a/torch/csrc/cuda/undef_macros.h +++ b/torch/csrc/cuda/undef_macros.h @@ -22,14 +22,14 @@ #undef THPStorageClass #undef THPStorageType -#undef THStorage -#undef THStorage_ -#undef THTensor -#undef THTensor_ +#undef THWStorage +#undef THWStorage_ +#undef THWTensor +#undef THWTensor_ -#undef THStoragePtr +#undef THWStoragePtr #undef THPStoragePtr -#undef THTensorPtr +#undef THWTensorPtr #undef THPTensorPtr @@ -44,9 +44,6 @@ #undef THSPTensorStateless #undef 
THSPTensorType -#undef THSTensor -#undef THSTensor_ -#undef THSTensorPtr #undef THSPTensorPtr diff --git a/torch/csrc/cuda/utils.cpp b/torch/csrc/cuda/utils.cpp index 270d972fc05a1b..30950bd0782ef1 100644 --- a/torch/csrc/cuda/utils.cpp +++ b/torch/csrc/cuda/utils.cpp @@ -7,3 +7,30 @@ #define THC_GENERIC_FILE "torch/csrc/generic/utils.cpp" #include + +#ifdef WITH_CUDA +std::vector THPUtils_PySequence_to_THCStreamList(PyObject *obj) { + if (!PySequence_Check(obj)) { + throw std::runtime_error("Expected a sequence in THPUtils_PySequence_to_THCStreamList"); + } + THPObjectPtr seq = THPObjectPtr(PySequence_Fast(obj, NULL)); + if (seq.get() == NULL) { + throw std::runtime_error("expected PySequence, but got " + std::string(THPUtils_typename(obj))); + } + + std::vector streams; + Py_ssize_t length = PySequence_Fast_GET_SIZE(seq.get()); + for (Py_ssize_t i = 0; i < length; i++) { + PyObject *stream = PySequence_Fast_GET_ITEM(seq.get(), i); + + if (PyObject_IsInstance(stream, THCPStreamClass)) { + streams.push_back( ((THCPStream *)stream)->cdata); + } else if (stream == Py_None) { + streams.push_back(NULL); + } else { + std::runtime_error("Unknown data type found in stream list. Need THCStream or None"); + } + } + return streams; +} +#endif diff --git a/torch/csrc/distributed/override_macros.h b/torch/csrc/distributed/override_macros.h index bc94d056174189..e788533adb7c98 100644 --- a/torch/csrc/distributed/override_macros.h +++ b/torch/csrc/distributed/override_macros.h @@ -1,14 +1,14 @@ #include "undef_macros.h" -#define THStoragePtr THDStoragePtr +#define THWStoragePtr THDStoragePtr #define THPStoragePtr THDPStoragePtr -#define THTensorPtr THDTensorPtr +#define THWTensorPtr THDTensorPtr #define THPTensorPtr THDPTensorPtr -#define THStorage THDStorage -#define THStorage_(NAME) THDStorage_(NAME) -#define THTensor THDTensor -#define THTensor_(NAME) THDTensor_(NAME) +#define THWStorage THDStorage +#define THWStorage_(NAME) THDStorage_(NAME) +#define THWTensor THDTensor +#define THWTensor_(NAME) THDTensor_(NAME) #define THPStorage_(NAME) TH_CONCAT_4(THDP,Real,Storage_,NAME) #define THPStorage THDPStorage diff --git a/torch/csrc/distributed/undef_macros.h b/torch/csrc/distributed/undef_macros.h index a3bed51406e6c3..969817503ec0ce 100644 --- a/torch/csrc/distributed/undef_macros.h +++ b/torch/csrc/distributed/undef_macros.h @@ -20,14 +20,14 @@ #undef THPStorageClass #undef THPStorageType -#undef THStorage -#undef THStorage_ -#undef THTensor -#undef THTensor_ +#undef THWStorage +#undef THWStorage_ +#undef THWTensor +#undef THWTensor_ -#undef THStoragePtr +#undef THWStoragePtr #undef THPStoragePtr -#undef THTensorPtr +#undef THWTensorPtr #undef THPTensorPtr #undef THHostTensor diff --git a/torch/csrc/generic/Storage.cpp b/torch/csrc/generic/Storage.cpp index 2d34aeabcc9afd..c50b8ce507b60c 100644 --- a/torch/csrc/generic/Storage.cpp +++ b/torch/csrc/generic/Storage.cpp @@ -4,7 +4,7 @@ PyObject *THPStorageClass = NULL; -PyObject * THPStorage_(New)(THStorage *ptr) +PyObject * THPStorage_(New)(THWStorage *ptr) { TORCH_ASSERT(ptr); PyTypeObject *type = (PyTypeObject *)THPStorageClass; @@ -12,24 +12,24 @@ PyObject * THPStorage_(New)(THStorage *ptr) if (obj) { ((THPStorage *)obj)->cdata = ptr; } else { - THStorage_(free)(LIBRARY_STATE ptr); + THWStorage_(free)(LIBRARY_STATE ptr); } return obj; } static void THPStorage_(dealloc)(THPStorage* self) { - THStorage_(free)(LIBRARY_STATE self->cdata); + THWStorage_(free)(LIBRARY_STATE self->cdata); Py_TYPE(self)->tp_free((PyObject*)self); } -static THStorage* 
THPStorage_(newWithAllocator)(int64_t size, THAllocator* allocator) +static THWStorage* THPStorage_(newWithAllocator)(int64_t size, THAllocator* allocator) { #if defined(THC_GENERIC_FILE) || defined(THD_GENERIC_FILE) THPUtils_setError(THPStorageStr " does not support custom allocators"); return NULL; #else - return THStorage_(newWithAllocator)(LIBRARY_STATE size, allocator, NULL); + return THWStorage_(newWithAllocator)(LIBRARY_STATE size, allocator, NULL); #endif } @@ -55,7 +55,7 @@ static PyObject * THPStorage_(pynew)(PyTypeObject *type, PyObject *args, PyObjec if (num_args == 0) { PyObject *cdata_ptr = PyDict_GetItemString(kwargs, "cdata"); if (num_kwargs == 1 && cdata_ptr && THPUtils_checkLong(cdata_ptr)) { - THStorage *ptr = (THStorage*)PyLong_AsVoidPtr(cdata_ptr); + THWStorage *ptr = (THWStorage*)PyLong_AsVoidPtr(cdata_ptr); self->cdata = ptr; return (PyObject*)self.release(); } @@ -68,7 +68,7 @@ static PyObject * THPStorage_(pynew)(PyTypeObject *type, PyObject *args, PyObjec if (allocator) { self->cdata = THPStorage_(newWithAllocator)(0, allocator); } else { - self->cdata = THStorage_(new)(LIBRARY_STATE_NOARGS); + self->cdata = THWStorage_(new)(LIBRARY_STATE_NOARGS); } return (PyObject*)self.release(); } @@ -81,7 +81,7 @@ static PyObject * THPStorage_(pynew)(PyTypeObject *type, PyObject *args, PyObjec if (allocator) { self->cdata = THPStorage_(newWithAllocator)(size, allocator); } else { - self->cdata = THStorage_(newWithSize)(LIBRARY_STATE size); + self->cdata = THWStorage_(newWithSize)(LIBRARY_STATE size); } return (PyObject*)self.release(); } @@ -117,11 +117,11 @@ static PyObject * THPStorage_(pynew)(PyTypeObject *type, PyObject *args, PyObjec "%" PRId64 ", but the viewed storage has only %" PRId64 " element(s) after offset %" PRId64, size, numel - offset, offset); - real *data_ptr = THStorage_(data)(LIBRARY_STATE storage_arg->cdata) + offset; - THStoragePtr storage(THStorage_(newWithData)(LIBRARY_STATE data_ptr, size)); + real *data_ptr = THWStorage_(data)(LIBRARY_STATE storage_arg->cdata) + offset; + THWStoragePtr storage(THWStorage_(newWithData)(LIBRARY_STATE data_ptr, size)); storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_VIEW; storage->view = storage_arg->cdata; - THStorage_(retain)(LIBRARY_STATE storage_arg->cdata); + THWStorage_(retain)(LIBRARY_STATE storage_arg->cdata); self->cdata = storage.release(); return (PyObject*)self.release(); #endif @@ -135,7 +135,7 @@ static PyObject * THPStorage_(pynew)(PyTypeObject *type, PyObject *args, PyObjec Py_ssize_t length = PySequence_Length(first_arg); THPUtils_assert(length >= 0, "couldn't obtain the length of %s", THPUtils_typename(first_arg)); - self->cdata = THStorage_(newWithSize)(LIBRARY_STATE length); + self->cdata = THWStorage_(newWithSize)(LIBRARY_STATE length); THPObjectPtr item; try { for (Py_ssize_t i = 0; i < length; i++) { @@ -177,7 +177,7 @@ static PyObject * THPStorage_(pynew)(PyTypeObject *type, PyObject *args, PyObjec static Py_ssize_t THPStorage_(length)(THPStorage *self) { HANDLE_TH_ERRORS - return THStorage_(size)(LIBRARY_STATE self->cdata); + return THWStorage_(size)(LIBRARY_STATE self->cdata); END_HANDLE_TH_ERRORS_RET(-1) } @@ -188,13 +188,13 @@ static PyObject * THPStorage_(get)(THPStorage *self, PyObject *index) if (THPUtils_checkLong(index)) { int64_t nindex = THPUtils_unpackLong(index); if (nindex < 0) - nindex += THStorage_(size)(LIBRARY_STATE self->cdata); + nindex += THWStorage_(size)(LIBRARY_STATE self->cdata); if (nindex < 0 || nindex >= self->cdata->size) { PyErr_Format(PyExc_IndexError, "index %" 
PRId64 " out of range for storage of " "size %" PRId64, (int64_t) nindex, (int64_t) self->cdata->size); return NULL; } - real value = THStorage_(get)(LIBRARY_STATE self->cdata, nindex); + real value = THWStorage_(get)(LIBRARY_STATE self->cdata, nindex); return THPUtils_(newReal)(value); /* Slice index */ } else if (PySlice_Check(index)) { @@ -203,7 +203,7 @@ static PyObject * THPStorage_(get)(THPStorage *self, PyObject *index) return NULL; #else Py_ssize_t start, stop, slicelength, step; - int64_t len = THStorage_(size)(LIBRARY_STATE self->cdata); + int64_t len = THWStorage_(size)(LIBRARY_STATE self->cdata); if (!THPUtils_parseSlice(index, len, &start, &stop, &step, &slicelength)) return NULL; if (step != 1) { @@ -212,11 +212,11 @@ static PyObject * THPStorage_(get)(THPStorage *self, PyObject *index) return NULL; } - real *data = THStorage_(data)(LIBRARY_STATE self->cdata); - THStoragePtr new_storage(THStorage_(newWithData)(LIBRARY_STATE data + start, slicelength)); + real *data = THWStorage_(data)(LIBRARY_STATE self->cdata); + THWStoragePtr new_storage(THWStorage_(newWithData)(LIBRARY_STATE data + start, slicelength)); new_storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_VIEW; new_storage->view = self->cdata; - THStorage_(retain)(LIBRARY_STATE self->cdata); + THWStorage_(retain)(LIBRARY_STATE self->cdata); PyObject *_ret = THPStorage_(New)(new_storage); new_storage.release(); @@ -242,11 +242,11 @@ static int THPStorage_(set)(THPStorage *self, PyObject *index, PyObject *value) real rvalue = THPUtils_(unpackReal)(value); if (THPUtils_checkLong(index)) { int64_t nindex = THPUtils_unpackLong(index); - THStorage_(set)(LIBRARY_STATE self->cdata, nindex, rvalue); + THWStorage_(set)(LIBRARY_STATE self->cdata, nindex, rvalue); return 0; } else if (PySlice_Check(index)) { Py_ssize_t start, stop, slicelength, step; - int64_t len = THStorage_(size)(LIBRARY_STATE self->cdata); + int64_t len = THWStorage_(size)(LIBRARY_STATE self->cdata); if (!THPUtils_parseSlice(index, len, &start, &stop, &step, &slicelength)) return -1; if (step != 1) { @@ -257,7 +257,7 @@ static int THPStorage_(set)(THPStorage *self, PyObject *index, PyObject *value) // TODO: check the bounds only once // TODO: fill? 
for (;start < stop; start++) - THStorage_(set)(LIBRARY_STATE self->cdata, start, rvalue); + THWStorage_(set)(LIBRARY_STATE self->cdata, start, rvalue); return 0; } THPUtils_setError("can't index a " THPStorageStr " with %s", @@ -319,33 +319,33 @@ static struct PyMemberDef THPStorage_(members)[] = { {NULL} }; -extern THPCopyList THStorage_(copy_functions); -THPCopyList THStorage_(copy_functions); +extern THPCopyList THWStorage_(copy_functions); +THPCopyList THWStorage_(copy_functions); void THPStorage_(initCopyMethods)() { #ifndef THD_GENERIC_FILE - auto& h = THStorage_(copy_functions); + auto& h = THWStorage_(copy_functions); // copy from CPU types - THPInsertStorageCopyFunction(&THPByteStorageType, h, &THStorage_(copyByte)); - THPInsertStorageCopyFunction(&THPCharStorageType, h, &THStorage_(copyChar)); - THPInsertStorageCopyFunction(&THPShortStorageType, h, &THStorage_(copyShort)); - THPInsertStorageCopyFunction(&THPIntStorageType, h, &THStorage_(copyInt)); - THPInsertStorageCopyFunction(&THPLongStorageType, h, &THStorage_(copyLong)); - THPInsertStorageCopyFunction(&THPHalfStorageType, h, &THStorage_(copyHalf)); - THPInsertStorageCopyFunction(&THPFloatStorageType, h, &THStorage_(copyFloat)); - THPInsertStorageCopyFunction(&THPDoubleStorageType, h, &THStorage_(copyDouble)); + THPInsertStorageCopyFunction(&THPByteStorageType, h, &THWStorage_(copyByte)); + THPInsertStorageCopyFunction(&THPCharStorageType, h, &THWStorage_(copyChar)); + THPInsertStorageCopyFunction(&THPShortStorageType, h, &THWStorage_(copyShort)); + THPInsertStorageCopyFunction(&THPIntStorageType, h, &THWStorage_(copyInt)); + THPInsertStorageCopyFunction(&THPLongStorageType, h, &THWStorage_(copyLong)); + THPInsertStorageCopyFunction(&THPHalfStorageType, h, &THWStorage_(copyHalf)); + THPInsertStorageCopyFunction(&THPFloatStorageType, h, &THWStorage_(copyFloat)); + THPInsertStorageCopyFunction(&THPDoubleStorageType, h, &THWStorage_(copyDouble)); #ifdef THC_GENERIC_FILE // copy from GPU types - THPInsertStorageCopyFunction(&THCPByteStorageType, h, &THStorage_(copyCudaByte)); - THPInsertStorageCopyFunction(&THCPCharStorageType, h, &THStorage_(copyCudaChar)); - THPInsertStorageCopyFunction(&THCPShortStorageType, h, &THStorage_(copyCudaShort)); - THPInsertStorageCopyFunction(&THCPIntStorageType, h, &THStorage_(copyCudaInt)); - THPInsertStorageCopyFunction(&THCPLongStorageType, h, &THStorage_(copyCudaLong)); - THPInsertStorageCopyFunction(&THCPFloatStorageType, h, &THStorage_(copyCudaFloat)); - THPInsertStorageCopyFunction(&THCPDoubleStorageType, h, &THStorage_(copyCudaDouble)); + THPInsertStorageCopyFunction(&THCPByteStorageType, h, &THWStorage_(copyCudaByte)); + THPInsertStorageCopyFunction(&THCPCharStorageType, h, &THWStorage_(copyCudaChar)); + THPInsertStorageCopyFunction(&THCPShortStorageType, h, &THWStorage_(copyCudaShort)); + THPInsertStorageCopyFunction(&THCPIntStorageType, h, &THWStorage_(copyCudaInt)); + THPInsertStorageCopyFunction(&THCPLongStorageType, h, &THWStorage_(copyCudaLong)); + THPInsertStorageCopyFunction(&THCPFloatStorageType, h, &THWStorage_(copyCudaFloat)); + THPInsertStorageCopyFunction(&THCPDoubleStorageType, h, &THWStorage_(copyCudaDouble)); #ifdef CUDA_HALF_TENSOR - THPInsertStorageCopyFunction(&THCPHalfStorageType, h, &THStorage_(copyCudaHalf)); + THPInsertStorageCopyFunction(&THCPHalfStorageType, h, &THWStorage_(copyCudaHalf)); #endif // add CPU <- GPU copies to base type #define THPCpuStorage TH_CONCAT_3(THP, Real, Storage) diff --git a/torch/csrc/generic/Storage.h b/torch/csrc/generic/Storage.h index 
141a2c33101657..4a9b0acc576978 100644 --- a/torch/csrc/generic/Storage.h +++ b/torch/csrc/generic/Storage.h @@ -4,10 +4,10 @@ struct THPStorage { PyObject_HEAD - THStorage *cdata; + THWStorage *cdata; }; -THP_API PyObject * THPStorage_(New)(THStorage *ptr); +THP_API PyObject * THPStorage_(New)(THWStorage *ptr); extern PyObject *THPStorageClass; #ifdef _THP_CORE diff --git a/torch/csrc/generic/StorageMethods.cpp b/torch/csrc/generic/StorageMethods.cpp index f525d621502997..26863d9e7404d6 100644 --- a/torch/csrc/generic/StorageMethods.cpp +++ b/torch/csrc/generic/StorageMethods.cpp @@ -5,7 +5,7 @@ static PyObject * THPStorage_(size)(THPStorage *self) { HANDLE_TH_ERRORS - return PyLong_FromLong(THStorage_(size)(LIBRARY_STATE self->cdata)); + return PyLong_FromLong(THWStorage_(size)(LIBRARY_STATE self->cdata)); END_HANDLE_TH_ERRORS } @@ -13,7 +13,7 @@ static PyObject * THPStorage_(size)(THPStorage *self) static PyObject * THPStorage_(dataPtr)(THPStorage *self) { HANDLE_TH_ERRORS - return PyLong_FromVoidPtr(THStorage_(data)(LIBRARY_STATE self->cdata)); + return PyLong_FromVoidPtr(THWStorage_(data)(LIBRARY_STATE self->cdata)); END_HANDLE_TH_ERRORS } #endif @@ -21,7 +21,7 @@ static PyObject * THPStorage_(dataPtr)(THPStorage *self) static PyObject * THPStorage_(copy_)(PyObject *self, PyObject *args, PyObject *kwargs) { HANDLE_TH_ERRORS - return THPStorageCopyMethod(THStorage_(copy_functions), self, args, kwargs); + return THPStorageCopyMethod(THWStorage_(copy_functions), self, args, kwargs); END_HANDLE_TH_ERRORS } @@ -31,7 +31,7 @@ static PyObject * THPStorage_(isPinned)(THPStorage *self) HANDLE_TH_ERRORS #if defined(WITH_CUDA) cudaPointerAttributes attr; - cudaError_t err = cudaPointerGetAttributes(&attr, THStorage_(data)(LIBRARY_STATE self->cdata)); + cudaError_t err = cudaPointerGetAttributes(&attr, THWStorage_(data)(LIBRARY_STATE self->cdata)); if (err != cudaSuccess) { cudaGetLastError(); Py_RETURN_FALSE; @@ -47,14 +47,14 @@ static PyObject * THPStorage_(isPinned)(THPStorage *self) static PyObject * THPStorage_(elementSize)(THPStorage *self) { HANDLE_TH_ERRORS - return PyLong_FromLong(THStorage_(elementSize)(LIBRARY_STATE_NOARGS)); + return PyLong_FromLong(THWStorage_(elementSize)(LIBRARY_STATE_NOARGS)); END_HANDLE_TH_ERRORS } static PyObject * THPStorage_(new)(THPStorage *self) { HANDLE_TH_ERRORS - THStoragePtr new_storage(THStorage_(new)(LIBRARY_STATE_NOARGS)); + THWStoragePtr new_storage(THWStorage_(new)(LIBRARY_STATE_NOARGS)); PyObject *_ret = THPStorage_(New)(new_storage); new_storage.release(); return _ret; @@ -67,7 +67,7 @@ static PyObject * THPStorage_(resize_)(THPStorage *self, PyObject *number_arg) THPUtils_assert(THPUtils_checkLong(number_arg), "resize_ expects an int, " "but got %s", THPUtils_typename(number_arg)); int64_t newsize = THPUtils_unpackLong(number_arg); - THStorage_(resize)(LIBRARY_STATE self->cdata, newsize); + THWStorage_(resize)(LIBRARY_STATE self->cdata, newsize); Py_INCREF(self); return (PyObject*)self; END_HANDLE_TH_ERRORS @@ -79,7 +79,7 @@ static PyObject * THPStorage_(fill_)(THPStorage *self, PyObject *number_arg) THPUtils_assert(THPUtils_(checkReal)(number_arg), "fill_ expects %s, " "but got %s", THPUtils_typeTraits::python_type_str, THPUtils_typename(number_arg)); - THStorage_(fill)(LIBRARY_STATE self->cdata, THPUtils_(unpackReal)(number_arg)); + THWStorage_(fill)(LIBRARY_STATE self->cdata, THPUtils_(unpackReal)(number_arg)); Py_INCREF(self); return (PyObject*)self; END_HANDLE_TH_ERRORS @@ -152,23 +152,23 @@ static PyObject * THPStorage_(fromBuffer)(PyObject 
*_unused, PyObject *args, PyO } uint8_t* src = (uint8_t*) buffer.buf; - THStorage* storage = THStorage_(newWithSize)(count); + THWStorage* storage = THWStorage_(newWithSize)(count); #if defined(TH_REAL_IS_BYTE) || defined(TH_REAL_IS_CHAR) - memcpy(THStorage_(data)(storage), src + offset, count); + memcpy(THWStorage_(data)(storage), src + offset, count); #elif defined(TH_REAL_IS_SHORT) - THP_decodeInt16Buffer(THStorage_(data)(storage), src + offset, byte_order, count); + THP_decodeInt16Buffer(THWStorage_(data)(storage), src + offset, byte_order, count); #elif defined(TH_REAL_IS_INT) - THP_decodeInt32Buffer(THStorage_(data)(storage), src + offset, byte_order, count); + THP_decodeInt32Buffer(THWStorage_(data)(storage), src + offset, byte_order, count); #elif defined(TH_REAL_IS_LONG) // TODO: remove the cast - THP_decodeInt64Buffer((int64_t*) THStorage_(data)(storage), src + offset, byte_order, count); + THP_decodeInt64Buffer((int64_t*) THWStorage_(data)(storage), src + offset, byte_order, count); #elif defined(TH_REAL_IS_HALF) - THP_decodeHalfBuffer(THStorage_(data)(storage), src + offset, byte_order, count); + THP_decodeHalfBuffer(THWStorage_(data)(storage), src + offset, byte_order, count); #elif defined(TH_REAL_IS_FLOAT) - THP_decodeFloatBuffer(THStorage_(data)(storage), src + offset, byte_order, count); + THP_decodeFloatBuffer(THWStorage_(data)(storage), src + offset, byte_order, count); #elif defined(TH_REAL_IS_DOUBLE) - THP_decodeDoubleBuffer(THStorage_(data)(storage), src + offset, byte_order, count); + THP_decodeDoubleBuffer(THWStorage_(data)(storage), src + offset, byte_order, count); #else #error "Unknown type" #endif @@ -192,7 +192,7 @@ static PyObject * THPStorage_(fromFile)(PyObject *_unused, PyObject *args, PyObj } if (shared) shared = TH_ALLOCATOR_MAPPED_SHARED; - THStorage *storage = THStorage_(newWithMapping)(LIBRARY_STATE filename, size, shared); + THWStorage *storage = THWStorage_(newWithMapping)(LIBRARY_STATE filename, size, shared); return (PyObject*)THPStorage_(New)(storage); END_HANDLE_TH_ERRORS } @@ -223,7 +223,7 @@ PyObject * THPStorage_(newWithFile)(PyObject *_unused, PyObject *file) int fd = PyObject_AsFileDescriptor(file); THPUtils_assert(fd != -1, "_new_with_file couldn't retrieve a file " "descriptor from given object"); - THStorage *storage = THPStorage_(readFileRaw)(fd, nullptr); + THWStorage *storage = THPStorage_(readFileRaw)(fd, nullptr); if (storage == nullptr) return nullptr; PyObject *result = THPStorage_(New)(storage); @@ -243,7 +243,7 @@ static PyObject *THPStorage_(setFromFile)(THPStorage *self, PyObject *args) // but it is currently unnecessary to support this. 
THPUtils_assert(offset == Py_None, "_set_from_file: offset is NYI for filelike objects"); - THStorage *storage = THPStorage_(readFileRaw)(file, self->cdata); + THWStorage *storage = THPStorage_(readFileRaw)(file, self->cdata); if (storage == nullptr) { return nullptr; } @@ -258,7 +258,7 @@ static PyObject *THPStorage_(setFromFile)(THPStorage *self, PyObject *args) } THPUtils_assert(fd != -1, "_set_from_file couldn't retrieve a file " "descriptor from given object"); - THStorage *storage = THPStorage_(readFileRaw)(fd, self->cdata); + THWStorage *storage = THPStorage_(readFileRaw)(fd, self->cdata); if (storage == nullptr) return nullptr; Py_INCREF(self); @@ -283,9 +283,9 @@ PyObject * THPStorage_(_setCdata)(THPStorage *self, PyObject *new_cdata) THPUtils_assert(THPUtils_checkLong(new_cdata), "given an invalid argument to " "_set_cdata - expected an int or long, but got %s", THPUtils_typename(new_cdata)); - THStorage *ptr = (THStorage*)PyLong_AsVoidPtr(new_cdata); - THStorage_(retain)(LIBRARY_STATE ptr); - THStorage_(free)(LIBRARY_STATE self->cdata); + THWStorage *ptr = (THWStorage*)PyLong_AsVoidPtr(new_cdata); + THWStorage_(retain)(LIBRARY_STATE ptr); + THWStorage_(free)(LIBRARY_STATE self->cdata); self->cdata = ptr; Py_INCREF(self); return (PyObject*)self; @@ -299,11 +299,11 @@ PyObject * THPStorage_(_rootStorage)(THPStorage *self) if (!(self->cdata->flag & TH_STORAGE_VIEW)) { return Py_BuildValue("(ON)", self, PyLong_FromLong(0)); } - THStorage *root = self->cdata; + THWStorage *root = self->cdata; while (root->flag & TH_STORAGE_VIEW) root = root->view; - size_t offset = THStorage_(data)(LIBRARY_STATE self->cdata) - THStorage_(data)(LIBRARY_STATE root); - THStorage_(retain)(LIBRARY_STATE root); + size_t offset = THWStorage_(data)(LIBRARY_STATE self->cdata) - THWStorage_(data)(LIBRARY_STATE root); + THWStorage_(retain)(LIBRARY_STATE root); THPObjectPtr storage(THPStorage_(New)(root)); PyObject *result = Py_BuildValue("(NN)", storage.get(), PyLong_FromLong(offset)); storage.release(); diff --git a/torch/csrc/generic/StorageSharing.cpp b/torch/csrc/generic/StorageSharing.cpp index eabc9f57c61620..4c09f22498a9a7 100644 --- a/torch/csrc/generic/StorageSharing.cpp +++ b/torch/csrc/generic/StorageSharing.cpp @@ -9,7 +9,7 @@ static PyObject * THPStorage_(sharedDecref)(THPStorage *self) HANDLE_TH_ERRORS #ifndef THC_GENERIC_FILE libshm_context *ctx = NULL; - THStorage *storage = self->cdata; + THWStorage *storage = self->cdata; if (storage->allocator == &THManagedSharedAllocator) { ctx = (libshm_context*)storage->allocatorContext; } else if (storage->allocator == &THStorageWeakRefAllocator) { @@ -18,7 +18,7 @@ static PyObject * THPStorage_(sharedDecref)(THPStorage *self) ctx = (libshm_context*)allocator_obj->allocatorContext; } if (ctx) - THRefcountedMapAllocator_decref(ctx->th_context, THStorage_(data)(storage)); + THRefcountedMapAllocator_decref(ctx->th_context, THWStorage_(data)(storage)); #endif Py_INCREF(self); return (PyObject *)self; @@ -30,7 +30,7 @@ static PyObject * THPStorage_(sharedIncref)(THPStorage *self) HANDLE_TH_ERRORS #ifndef THC_GENERIC_FILE libshm_context *ctx = NULL; - THStorage *storage = self->cdata; + THWStorage *storage = self->cdata; if (storage->allocator == &THManagedSharedAllocator) { ctx = (libshm_context*)storage->allocatorContext; } else if (storage->allocator == &THStorageWeakRefAllocator) { @@ -39,19 +39,19 @@ static PyObject * THPStorage_(sharedIncref)(THPStorage *self) ctx = (libshm_context*)allocator_obj->allocatorContext; } if (ctx) - 
THRefcountedMapAllocator_incref(ctx->th_context, THStorage_(data)(storage)); + THRefcountedMapAllocator_incref(ctx->th_context, THWStorage_(data)(storage)); #endif Py_RETURN_NONE; END_HANDLE_TH_ERRORS } -static PyObject * THPStorage_(newTHView)(THStorage *base, ptrdiff_t offset, size_t size) +static PyObject * THPStorage_(newTHView)(THWStorage *base, ptrdiff_t offset, size_t size) { void *data = (char*)base->data() + offset; - THStoragePtr view(THStorage_(newWithData)(LIBRARY_STATE (real*)data, size)); + THWStoragePtr view(THWStorage_(newWithData)(LIBRARY_STATE (real*)data, size)); view->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_VIEW; view->view = base; - THStorage_(retain)(LIBRARY_STATE base); + THWStorage_(retain)(LIBRARY_STATE base); return THPStorage_(New)(view.release()); } @@ -69,12 +69,12 @@ static std::string THPStorage_(__newHandle)() { return handle; } -static THStorage* THPStorage_(newFilenameStorage)(ptrdiff_t size) +static THWStorage* THPStorage_(newFilenameStorage)(ptrdiff_t size) { int flags = TH_ALLOCATOR_MAPPED_SHAREDMEM | TH_ALLOCATOR_MAPPED_EXCLUSIVE; std::string handle = THPStorage_(__newHandle)(); auto ctx = libshm_context_new(NULL, handle.c_str(), flags); - return THStorage_(newWithAllocator)(size, &THManagedSharedAllocator, (void*)ctx); + return THWStorage_(newWithAllocator)(size, &THManagedSharedAllocator, (void*)ctx); } static PyObject * THPStorage_(pyNewFilenameStorage)(PyObject *_unused, PyObject *args) @@ -91,7 +91,7 @@ static PyObject * THPStorage_(pyNewFilenameStorage)(PyObject *_unused, PyObject static PyObject * THPStorage_(shareFilename)(THPStorage *self) { HANDLE_TH_ERRORS - THStorage *storage = self->cdata; + THWStorage *storage = self->cdata; libshm_context *ctx; // Storage is already in shared memory, just return a handle if (storage->allocator == &THManagedSharedAllocator) { @@ -102,9 +102,9 @@ static PyObject * THPStorage_(shareFilename)(THPStorage *self) } else { // TODO: retry on collision AutoNoGIL no_gil; - THStoragePtr new_storage(THPStorage_(newFilenameStorage)(storage->size)); - THStorage_(copy)(new_storage, storage); - THStorage_(swap)(storage, new_storage); + THWStoragePtr new_storage(THPStorage_(newFilenameStorage)(storage->size)); + THWStorage_(copy)(new_storage, storage); + THWStorage_(swap)(storage, new_storage); ctx = (libshm_context*)storage->allocatorContext; } @@ -143,12 +143,12 @@ static PyObject * THPStorage_(newSharedFilename)(PyObject *_unused, PyObject *ar int flags = TH_ALLOCATOR_MAPPED_SHAREDMEM | TH_ALLOCATOR_MAPPED_NOCREATE; libshm_context *ctx = libshm_context_new(manager_handle, object_handle, flags); - return THPStorage_(New)(THStorage_(newWithAllocator)(size, + return THPStorage_(New)(THWStorage_(newWithAllocator)(size, &THManagedSharedAllocator, (void*)ctx)); END_HANDLE_TH_ERRORS } -static THStorage* THPStorage_(newFdStorage)(ptrdiff_t size) +static THWStorage* THPStorage_(newFdStorage)(ptrdiff_t size) { int flags = TH_ALLOCATOR_MAPPED_SHAREDMEM | TH_ALLOCATOR_MAPPED_EXCLUSIVE | @@ -156,7 +156,7 @@ static THStorage* THPStorage_(newFdStorage)(ptrdiff_t size) TH_ALLOCATOR_MAPPED_UNLINK; std::string handle = THPStorage_(__newHandle)(); auto ctx = THMapAllocatorContext_new(handle.c_str(), flags); - return THStorage_(newWithAllocator)(size, &THMapAllocator, (void*)ctx); + return THWStorage_(newWithAllocator)(size, &THMapAllocator, (void*)ctx); } static PyObject * THPStorage_(pyNewFdStorage)(PyObject *_unused, PyObject *args) @@ -173,7 +173,7 @@ static PyObject * THPStorage_(pyNewFdStorage)(PyObject *_unused, PyObject *args) 
static PyObject * THPStorage_(shareFd)(THPStorage *self) { HANDLE_TH_ERRORS - THStorage *storage = self->cdata; + THWStorage *storage = self->cdata; THMapAllocatorContext *ctx; // Storage is already in shared memory, just return a handle if (storage->allocator == &THMapAllocator) { @@ -183,9 +183,9 @@ static PyObject * THPStorage_(shareFd)(THPStorage *self) ctx = (THMapAllocatorContext*)allocator_obj->allocatorContext; } else { AutoNoGIL no_gil; - THStoragePtr new_storage(THPStorage_(newFdStorage)(storage->size)); - THStorage_(copy)(new_storage, storage); - THStorage_(swap)(storage, new_storage); + THWStoragePtr new_storage(THPStorage_(newFdStorage)(storage->size)); + THWStorage_(copy)(new_storage, storage); + THWStorage_(swap)(storage, new_storage); ctx = (THMapAllocatorContext*)storage->allocatorContext; } @@ -226,7 +226,7 @@ static PyObject * THPStorage_(newSharedFd)(PyObject *_unused, PyObject *args) TH_ALLOCATOR_MAPPED_KEEPFD | TH_ALLOCATOR_MAPPED_FROMFD; THMapAllocatorContext *ctx = THMapAllocatorContext_newWithFd(NULL, fd, flags); - return THPStorage_(New)(THStorage_(newWithAllocator)(size, &THMapAllocator, + return THPStorage_(New)(THWStorage_(newWithAllocator)(size, &THMapAllocator, (void*)ctx)); END_HANDLE_TH_ERRORS } @@ -236,7 +236,7 @@ static PyObject * THPStorage_(newSharedFd)(PyObject *_unused, PyObject *args) static PyObject * THPStorage_(shareCuda)(THPStorage *self) { HANDLE_TH_ERRORS - THStorage *storage = self->cdata; + THWStorage *storage = self->cdata; AutoGPU gpu_guard(storage->device); THPObjectPtr tuple(PyTuple_New(5)); THPObjectPtr device(PyLong_FromLong(storage->device)); @@ -245,9 +245,9 @@ static PyObject * THPStorage_(shareCuda)(THPStorage *self) THPObjectPtr size(PyLong_FromLong(storage->size)); THPObjectPtr _offset(PyLong_FromLong(0)); THPObjectPtr view_size(PyLong_FromLong(storage->size)); - if (THStorage_(data)(LIBRARY_STATE storage)) { + if (THWStorage_(data)(LIBRARY_STATE storage)) { size_t base_size; - void *base_ptr = THCCachingAllocator_getBaseAllocation(THStorage_(data)(LIBRARY_STATE storage), &base_size); + void *base_ptr = THCCachingAllocator_getBaseAllocation(THWStorage_(data)(LIBRARY_STATE storage), &base_size); ptrdiff_t offset = (char*)storage->data() - (char*)base_ptr; cudaIpcMemHandle_t handle; @@ -304,7 +304,7 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args) void *devPtr = NULL; THCudaCheck(cudaIpcOpenMemHandle(&devPtr, handle, cudaIpcMemLazyEnablePeerAccess)); - THStoragePtr base(THStorage_(newWithDataAndAllocator)( + THWStoragePtr base(THWStorage_(newWithDataAndAllocator)( LIBRARY_STATE (real*)devPtr, storage_size, &THCIpcAllocator, (void*)device)); base->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_FREEMEM; @@ -317,13 +317,13 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args) } #endif -// Returns an object that holds a "weak" pointer to the THStorage. The -// pointer is set to None when the THStorage is deallocated. This is done by +// Returns an object that holds a "weak" pointer to the THWStorage. The +// pointer is set to None when the THWStorage is deallocated. This is done by // wrapping the storages allocator with THStorageWeakRefAllocator which is // responsible for clearing the weak reference. 
static PyObject * THPStorage_(weakRef)(THPStorage *self, PyObject *weak_ref_class) { HANDLE_TH_ERRORS - THStorage* storage = self->cdata; + THWStorage* storage = self->cdata; while (storage->flag & TH_STORAGE_VIEW) { storage = storage->view; } @@ -367,8 +367,8 @@ PyObject * THPStorage_(newWithWeakPtr)(PyObject *_unused, PyObject *arg) } THPUtils_assert(THPUtils_checkLong(ref.get()), "_new_with_weak_ptr(): arg.cdata must be an 'int'"); - THStorage *storage = (THStorage*)PyLong_AsVoidPtr(ref.get()); - if (THStorage_(retainIfLive)(LIBRARY_STATE storage)) { + THWStorage *storage = (THWStorage*)PyLong_AsVoidPtr(ref.get()); + if (THWStorage_(retainIfLive)(LIBRARY_STATE storage)) { return THPStorage_(New)(storage); } Py_RETURN_NONE; @@ -380,7 +380,7 @@ PyObject * THPStorage_(sharedFd)(THPStorage *self) HANDLE_TH_ERRORS THMapAllocatorContext *ctx = NULL; #ifndef THC_GENERIC_FILE - THStorage *storage = self->cdata; + THWStorage *storage = self->cdata; if (storage->allocator == &THMapAllocator) { ctx = (THMapAllocatorContext*)storage->allocatorContext; } else if (storage->allocator == &THStorageWeakRefAllocator) { diff --git a/torch/csrc/generic/serialization.cpp b/torch/csrc/generic/serialization.cpp index 8550e9c9d83a9a..42dff61c2feb9f 100644 --- a/torch/csrc/generic/serialization.cpp +++ b/torch/csrc/generic/serialization.cpp @@ -5,16 +5,16 @@ #define SYSCHECK(call) { ssize_t __result = call; if (__result < 0) throw std::system_error((int) __result, std::system_category()); } template -void THPStorage_(writeFileRaw)(THStorage *self, io fd) +void THPStorage_(writeFileRaw)(THWStorage *self, io fd) { real *data; - int64_t size = THStorage_(size)(LIBRARY_STATE self); + int64_t size = THWStorage_(size)(LIBRARY_STATE self); #ifndef THC_GENERIC_FILE - data = THStorage_(data)(LIBRARY_STATE self); + data = THWStorage_(data)(LIBRARY_STATE self); #else std::unique_ptr cpu_data(new char[size * sizeof(real)]); data = (real*)cpu_data.get(); - THCudaCheck(cudaMemcpy(data, THStorage_(data)(LIBRARY_STATE self), size * sizeof(real), cudaMemcpyDeviceToHost)); + THCudaCheck(cudaMemcpy(data, THWStorage_(data)(LIBRARY_STATE self), size * sizeof(real), cudaMemcpyDeviceToHost)); #endif ssize_t result = doWrite(fd, &size, sizeof(int64_t)); if (result != sizeof(int64_t)) @@ -59,11 +59,11 @@ void THPStorage_(writeFileRaw)(THStorage *self, io fd) } } -template void THPStorage_(writeFileRaw)(THStorage *self, int fd); -template void THPStorage_(writeFileRaw)(THStorage *self, PyObject* fd); +template void THPStorage_(writeFileRaw)(THWStorage *self, int fd); +template void THPStorage_(writeFileRaw)(THWStorage *self, PyObject* fd); template -THStorage * THPStorage_(readFileRaw)(io file, THStorage *_storage) +THWStorage * THPStorage_(readFileRaw)(io file, THWStorage *_storage) { real *data; int64_t size; @@ -72,18 +72,18 @@ THStorage * THPStorage_(readFileRaw)(io file, THStorage *_storage) throw std::runtime_error("unexpected EOF. 
The file might be corrupted."); if (result != sizeof(int64_t)) throw std::system_error(result, std::system_category()); - THStoragePtr storage; + THWStoragePtr storage; if (_storage == nullptr) { - storage = THStorage_(newWithSize)(LIBRARY_STATE size); + storage = THWStorage_(newWithSize)(LIBRARY_STATE size); } else { - THPUtils_assert(THStorage_(size)(LIBRARY_STATE _storage) == size, + THPUtils_assert(THWStorage_(size)(LIBRARY_STATE _storage) == size, "storage has wrong size: expected %ld got %ld", - size, THStorage_(size)(LIBRARY_STATE _storage)); + size, THWStorage_(size)(LIBRARY_STATE _storage)); storage = _storage; } #ifndef THC_GENERIC_FILE - data = THStorage_(data)(LIBRARY_STATE storage); + data = THWStorage_(data)(LIBRARY_STATE storage); #else std::unique_ptr cpu_data(new char[size * sizeof(real)]); data = (real*)cpu_data.get(); @@ -92,7 +92,7 @@ THStorage * THPStorage_(readFileRaw)(io file, THStorage *_storage) // fast track for bytes and little endian if (sizeof(real) == 1 || THP_nativeByteOrder() == THPByteOrder::THP_LITTLE_ENDIAN) { char *bytes = (char *) data; - int64_t remaining = sizeof(real) * THStorage_(size)(LIBRARY_STATE storage); + int64_t remaining = sizeof(real) * THWStorage_(size)(LIBRARY_STATE storage); while (remaining > 0) { // we write and read in 1GB blocks to avoid bugs on some OSes ssize_t result = doRead(file, bytes, THMin(remaining, 1073741824)); @@ -134,13 +134,13 @@ THStorage * THPStorage_(readFileRaw)(io file, THStorage *_storage) } #ifdef THC_GENERIC_FILE - THCudaCheck(cudaMemcpy(THStorage_(data)(LIBRARY_STATE storage), data, size * sizeof(real), cudaMemcpyHostToDevice)); + THCudaCheck(cudaMemcpy(THWStorage_(data)(LIBRARY_STATE storage), data, size * sizeof(real), cudaMemcpyHostToDevice)); #endif return storage.release(); } -template THStorage* THPStorage_(readFileRaw)(int fd, THStorage* storage); -template THStorage* THPStorage_(readFileRaw)(PyObject* fd, THStorage* storage); +template THWStorage* THPStorage_(readFileRaw)(int fd, THWStorage* storage); +template THWStorage* THPStorage_(readFileRaw)(PyObject* fd, THWStorage* storage); #undef SYSCHECK diff --git a/torch/csrc/generic/serialization.h b/torch/csrc/generic/serialization.h index d4f46c889f40d7..e73ac7c76229a9 100644 --- a/torch/csrc/generic/serialization.h +++ b/torch/csrc/generic/serialization.h @@ -3,9 +3,9 @@ #else template -void THPStorage_(writeFileRaw)(THStorage *self, io fd); +void THPStorage_(writeFileRaw)(THWStorage *self, io fd); template -THStorage * THPStorage_(readFileRaw)(io fd, THStorage *storage); +THWStorage * THPStorage_(readFileRaw)(io fd, THWStorage *storage); #endif diff --git a/torch/csrc/generic/utils.cpp b/torch/csrc/generic/utils.cpp index fe722b1ab28943..0e59a81ed1ee6a 100644 --- a/torch/csrc/generic/utils.cpp +++ b/torch/csrc/generic/utils.cpp @@ -9,9 +9,9 @@ #endif template<> -void THPPointer::free() { +void THPPointer::free() { if (ptr) - THTensor_(free)(LIBRARY_STATE ptr); + THWTensor_(free)(LIBRARY_STATE ptr); } template<> @@ -20,20 +20,8 @@ void THPPointer::free() { Py_DECREF(ptr); } -#if GENERATE_SPARSE -template<> -void THPPointer::free() { - if (ptr) - THSTensor_(free)(LIBRARY_STATE ptr); -} -#endif - - -template class THPPointer; +template class THPPointer; template class THPPointer; -#if GENERATE_SPARSE -template class THPPointer; -#endif #undef GENERATE_SPARSE diff --git a/torch/csrc/generic/utils.h b/torch/csrc/generic/utils.h index a9191a7c6f1b30..bcccffa051b770 100644 --- a/torch/csrc/generic/utils.h +++ b/torch/csrc/generic/utils.h @@ -11,14 +11,10 @@ 
struct THPStorage; struct THSPTensor; -typedef class THPPointer THStoragePtr; -typedef class THPPointer THTensorPtr; +typedef class THPPointer THWStoragePtr; +typedef class THPPointer THWTensorPtr; typedef class THPPointer THPStoragePtr; -#if GENERATE_SPARSE -typedef class THPPointer THSTensorPtr; -#endif - #if (!defined(THC_GENERIC_FILE) || defined(THC_REAL_IS_HALF)) && \ (!defined(THD_GENERIC_FILE)) template<> diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index 4e9f50f98c451d..8cf5bf386f25aa 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -15,6 +15,7 @@ #include "torch/csrc/jit/passes/shape_analysis.h" #include "torch/csrc/jit/passes/remove_expands.h" #include "torch/csrc/jit/passes/decompose_addmm.h" +#include "torch/csrc/jit/passes/loop_unrolling.h" #include "torch/csrc/autograd/edge.h" #include "torch/csrc/autograd/function.h" @@ -349,6 +350,7 @@ struct GraphExecutorImpl { // do not work on variables // They also may assume that concrete sizes/strides are availiable + UnrollLoops(graph); //TODO: create peephole optimizations that are safe to run // when we are using variables, and when we do not know sizes. diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index 2790161e2208da..b7514465db0c28 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -16,6 +16,7 @@ #include "torch/csrc/jit/passes/onnx/fixup_onnx_loop.h" #include "torch/csrc/jit/passes/shape_analysis.h" #include "torch/csrc/jit/passes/decompose_addmm.h" +#include "torch/csrc/jit/passes/loop_unrolling.h" #include "torch/csrc/jit/graph_executor.h" #include "torch/csrc/jit/script/init.h" #include "torch/csrc/jit/script/python_tree_views.h" @@ -76,6 +77,7 @@ void initJITBindings(PyObject *module) { auto tensor_inputs = createVariableTensorList(inputs); PropagateInputShapes(graph, ArgumentSpec(with_grad, tensor_inputs)); }) + .def("_jit_pass_loop_unrolling", UnrollLoops) .def("_jit_run_cpp_tests", [] { // We have to release the GIL inside this method, because if we happen to // initialize the autograd engine in these tests, the newly spawned worker threads will diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index 40f95c6f3b1156..bbeb60fddef6c9 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -557,10 +557,18 @@ inline Value* Value::setUniqueName(const std::string & name) { auto old_owner_of_name = names.find(name); if(old_owner_of_name != names.end()) { size_t suffix = 1; + std::string name_base = name; + auto last_dot_pos = name.find_last_of('.'); + if (last_dot_pos != std::string::npos && last_dot_pos + 1 != name.size()) { + if (name.find_first_not_of("0123456789", last_dot_pos + 1) == std::string::npos) { + suffix = std::stoll(name.substr(last_dot_pos + 1)); + name_base = name.substr(0, last_dot_pos); + } + } std::string replacement_name; do { std::stringstream ss; - ss << name << "." << suffix++; + ss << name_base << "." 
<< suffix++; replacement_name = ss.str(); } while(names.count(replacement_name) > 0); old_owner_of_name->second->setUniqueName(replacement_name); diff --git a/torch/csrc/jit/passes/dead_code_elimination.cpp b/torch/csrc/jit/passes/dead_code_elimination.cpp index 3e5b4d95703466..42b1e8086b9cca 100644 --- a/torch/csrc/jit/passes/dead_code_elimination.cpp +++ b/torch/csrc/jit/passes/dead_code_elimination.cpp @@ -1,23 +1,49 @@ #include "torch/csrc/jit/passes/dead_code_elimination.h" +#include + namespace torch { namespace jit { -void EliminateDeadCode(std::shared_ptr& graph) { - EliminateDeadCode(graph->block()); -} -bool hasSideEffects(Node * node) { - return node->kind() == prim::Print || node->blocks().size() > 0; +using bool_memo_type = std::unordered_map; + +bool hasSideEffects(Node * node, bool_memo_type& memo) { + // FIXME: PythonOp and CppOp should be treated as having side effects as well! + // Unfortunately ONNX depends on them getting removed in this pass, so it's not + // a simple change. + auto it = memo.find(node); + if (it != memo.end()) + return it->second; + bool has_side_effects = node->kind() == prim::Print || + std::any_of(node->blocks().begin(), node->blocks().end(), + [&](Block *b) { + return std::any_of(b->nodes().begin(), b->nodes().end(), + [&](Node *n) { return hasSideEffects(n, memo); }); + }); + memo.emplace(node, has_side_effects); + return has_side_effects; } -void EliminateDeadCode(Block *block) { +void EliminateDeadCode(Block *block, bool recurse, bool_memo_type& memo) { auto nodes = block->nodes().reverse(); for (auto it = nodes.begin(); it != nodes.end(); it++) { auto node = *it; - for (Block * block : node->blocks()) - EliminateDeadCode(block); - if (!node->hasUses() && !hasSideEffects(node)) + if (recurse) { + for (Block * block : node->blocks()) + EliminateDeadCode(block, true, memo); + } + if (!node->hasUses() && !hasSideEffects(node, memo)) it.destroyCurrent(); } } -}} +void EliminateDeadCode(std::shared_ptr& graph) { + bool_memo_type side_effect_memo; + EliminateDeadCode(graph->block(), true, side_effect_memo); +} + +void EliminateDeadCode(Block *block, bool recurse) { + bool_memo_type side_effect_memo; + EliminateDeadCode(block, recurse, side_effect_memo); +} + +}} // namespace torch::jit diff --git a/torch/csrc/jit/passes/dead_code_elimination.h b/torch/csrc/jit/passes/dead_code_elimination.h index 438e33d7ebcfef..a6b38f16f586f4 100644 --- a/torch/csrc/jit/passes/dead_code_elimination.h +++ b/torch/csrc/jit/passes/dead_code_elimination.h @@ -5,6 +5,6 @@ namespace torch { namespace jit { void EliminateDeadCode(std::shared_ptr& graph); -void EliminateDeadCode(Block *block); +void EliminateDeadCode(Block *block, bool recurse=true); }} diff --git a/torch/csrc/jit/passes/loop_unrolling.cpp b/torch/csrc/jit/passes/loop_unrolling.cpp new file mode 100644 index 00000000000000..bbf05f198a3b61 --- /dev/null +++ b/torch/csrc/jit/passes/loop_unrolling.cpp @@ -0,0 +1,212 @@ +#include "torch/csrc/jit/passes/loop_unrolling.h" + +#include "torch/csrc/jit/interned_strings.h" +#include "torch/csrc/jit/symbolic_variable.h" +#include "torch/csrc/jit/tensor_conversions.h" +#include "torch/csrc/jit/passes/dead_code_elimination.h" + +namespace torch { namespace jit { + +namespace { + +static constexpr int64_t kUnrollFactor = 8; +static constexpr int64_t kMaxBodySize = 16; +static constexpr int64_t kMaxBodyRepeats = 64; + +bool isTrueConstant(Value *val) { + at::optional maybe_value = constant_as(val); + return maybe_value && *maybe_value; +} + +bool isForLoop(Node* node) { + 
if (node->kind() != prim::Loop) + return false; + Value *start_cond = node->inputs().at(1); + Value *continue_cond = node->blocks().at(0)->outputs().at(0); + return isTrueConstant(start_cond) && isTrueConstant(continue_cond); +} + +// Counts the size of this block, stopping and returning once reaches limit instructions. +int64_t limitedBlockSize(Block *body, int64_t limit) { + auto it = body->nodes().begin(); + auto end = body->nodes().end(); + for (int64_t i = 0; i < limit; ++i, ++it) { + for (Block *subblock : it->blocks()) { + i += limitedBlockSize(subblock, limit - i); + } + if (it == end) { + return i; + } + } + return limit; +} + +bool isSmallBlock(Block *body) { + return limitedBlockSize(body, kMaxBodySize + 1) <= kMaxBodySize; +} + +// XXX: This function can only be called with a loop that is guaranteed to execute EXACTLY ONCE. +void inlineBody(Node *loop) { + auto graph = loop->owningGraph(); + auto body = loop->blocks().at(0); + WithInsertPoint insert_point_guard { loop }; + + std::unordered_map value_map; + auto get_value = [&](Value *v) { + auto it = value_map.find(v); + if (it != value_map.end()) + return it->second; + return v; + }; + + // Loop node has extra (max_iters, initial_cond) inputs, + // body has an extra (loop_counter) input. + for (size_t i = 2; i < loop->inputs().size(); ++i) { + value_map[body->inputs()[i - 1]] = loop->inputs()[i]; + } + + for (Node *orig : body->nodes()) { + Node *clone = graph->insertNode(graph->createClone(orig, get_value)); + for (size_t i = 0; i < orig->outputs().size(); ++i) { + value_map[orig->outputs()[i]] = clone->outputs()[i]; + } + } + for (size_t i = 0; i < loop->outputs().size(); ++i) { + loop->outputs().at(i)->replaceAllUsesWith(get_value(body->outputs().at(i + 1))); + } + // XXX: it is extremely important to destroy the loop in here. DCE might not be able + // to conclude that it's safe, because the loop might contain side effects. + loop->destroy(); +} + +void repeatBody(Block *body, int64_t times) { + // We will be adding nodes to the body, so cache the initial start and end. + // XXX: they are both inclusive, because the exclusive body_end would point to + // return_node, which would move further away if we were to add nodes, and we + // would enter an infinite loop. 
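For intuition (an editorial aside, not part of the patch): with a single loop-carried value, repeatBody(body, 3) leaves behind a straight-line body of roughly this shape,

  %y1 = f(%y0)   // original copy
  %y2 = f(%y1)   // clone; loop-carried input remapped to the previous copy's output
  %y3 = f(%y2)   // clone

with the block outputs rewired to the last copy's values, which is what the value_map threading below implements.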
+ auto body_start = body->nodes().begin(); + auto body_end = std::prev(body->nodes().end()); + auto graph = body->owningGraph(); + WithInsertPoint insert_point_guard { body }; + + std::unordered_map value_map; + auto get_value = [&](Value *v) { + auto it = value_map.find(v); + if (it != value_map.end()) + return it->second; + return v; + }; + + for (int64_t i = 1; i < times; ++i) { + // Update loop-carried values + // NB: note that we don't need to worry about the loop counter, because we've + // replaced it with a loop-carried variable + JIT_ASSERT(body->inputs().size() == body->outputs().size()); + for (size_t i = 1; i < body->inputs().size(); ++i) { + value_map[body->inputs()[i]] = get_value(body->outputs()[i]); + } + + // Clone the nodes + for (auto it = body_start; it != std::next(body_end); ++it) { + Node *orig = *it; + Node *clone = graph->insertNode(graph->createClone(orig, get_value)); + for (size_t i = 0; i < orig->outputs().size(); ++i) { + value_map[orig->outputs()[i]] = clone->outputs()[i]; + } + } + } + + // Update outputs of the body + const std::vector new_outputs = fmap(body->outputs(), get_value); + for (int64_t i = new_outputs.size() - 1; i >= 0; --i) { + body->eraseOutput(i); + } + for (Value *output : new_outputs) { + body->registerOutput(output); + } + + // It's likely that we have some dead nodes now - for example the "true" constant + // that prevents the loop from breaking. We shouldn't wait too long before removing + // them because they might artificially increase the loop size and prevent outer loop + // unrolling. + EliminateDeadCode(body, false); +} + +// Replaces the builtin loop counter with a "mutable" variable outside of the loop. +void replaceLoopCounter(Node *loop) { + Graph *graph = loop->owningGraph(); + Block *body = loop->blocks().at(0); + Node *init_counter_node = graph->createConstant(at::CPU(at::kLong).scalarTensor(0)) + ->insertBefore(loop); + loop->insertInput(2, init_counter_node->output()); + loop->insertOutput(0); + + Value * internal_counter = body->insertInput(1); + body->inputs()[0]->replaceAllUsesWith(internal_counter); + + WithInsertPoint insertPointGuard{ body->return_node() }; + body->insertOutput(1, SymbolicVariable(internal_counter) + at::Scalar(1)); +} + +void unroll(Node *loop) { + Graph *graph = loop->owningGraph(); + Block *body = loop->blocks().at(0); + if (!isSmallBlock(body)) + return; + + // We will be using a "mutable" counter outside of the loop instead of the default + // one, because this will allow us to share it between the unrolled loop and its epilogue. + // This is necessary only if the loop counter is actually used in the body. + if (body->inputs()[0]->uses().size() > 0) + replaceLoopCounter(loop); + + // Some optimization for constant-length loops. If we know they won't run too many + // times, then we can unroll them entirely. + Value *trip_count = loop->inputs().at(0); + int64_t const_len = constant_as(trip_count).value_or(-1); + if (const_len != -1 && const_len < kMaxBodyRepeats) { + repeatBody(body, const_len); + inlineBody(loop); + return; + } + + WithInsertPoint insert_point_guard { loop }; + + // Clone the loop before we unroll it. The clone will become the epilogue. 
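Concretely (an editorial worked example, not part of the patch): with kUnrollFactor = 8 and a runtime trip count of 19, the rewritten main loop runs 19 / 8 = 2 times, each pass covering 8 copies of the original body (16 iterations), and the epilogue cloned just below runs 19 - 2 * 8 = 3 times to finish the remainder. Loops whose trip count is a known constant below kMaxBodyRepeats = 64 never reach this point; they are fully expanded and inlined by the branch above.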
+ Node *loop_epilogue = graph->createClone(loop, [](Value *v) { return v; }) + ->insertAfter(loop); + for (size_t i = 0; i < loop->outputs().size(); ++i) { + loop->outputs()[i]->replaceAllUsesWith(loop_epilogue->outputs()[i]); + loop_epilogue->replaceInput(i + 2, loop->outputs()[i]); + } + + repeatBody(body, kUnrollFactor); + + // Change the iteration counts of both loops + SymbolicVariable iter_count = loop->inputs().at(0); + SymbolicVariable unrolled_iter_count = iter_count / kUnrollFactor; + loop->replaceInput(0, unrolled_iter_count); + loop_epilogue->replaceInput(0, iter_count - (unrolled_iter_count * kUnrollFactor)); +} + +void UnrollLoops(Block *block) { + for (auto it = block->nodes().begin(); it != block->nodes().end();) { + // XXX: unroll might destroy the current node, so we need to pre-increment the iterator + Node *node = *it; ++it; + for (Block *subblock : node->blocks()) { + UnrollLoops(subblock); + } + if (isForLoop(node)) { + unroll(node); + } + } +} + +} // anonymous namespace + +void UnrollLoops(std::shared_ptr& graph) { + UnrollLoops(graph->block()); + EliminateDeadCode(graph); +} + +}} // namespace torch::jit diff --git a/torch/csrc/jit/passes/loop_unrolling.h b/torch/csrc/jit/passes/loop_unrolling.h new file mode 100644 index 00000000000000..4ca1fd761f9459 --- /dev/null +++ b/torch/csrc/jit/passes/loop_unrolling.h @@ -0,0 +1,9 @@ +#pragma once + +#include "torch/csrc/jit/ir.h" + +namespace torch { namespace jit { + +void UnrollLoops(std::shared_ptr& graph); + +}} // namespace torch::jit diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 8f9f818e623916..163b8adda1ace3 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -290,20 +290,6 @@ static bool isTensorSubtype(Value* v) { return v->type()->isSubtypeOf(*DynamicType::get()); } -// if a value is a constant then try to turn into type T using the -// same rules as the interpreter -template -at::optional constant_as(Value* v) { - if(v->node()->kind() != prim::Constant) - return at::nullopt; - auto tensor = v->node()->t(attr::value); - try { - return tensor_as(std::move(tensor)); - } catch (tensor_conversion_error& err) { - return at::nullopt; - } -} - at::optional> getIntListAttribute(at::optional N, Value* input) { auto list = constant_as(input); if(list) @@ -324,9 +310,21 @@ void liftConstantAttributes(const FunctionSchema& schema, Node* node) { JIT_ASSERT(!node->hasAttributes()); std::vector new_inputs; Attributes attributes; - for(size_t i = 0; i < node->inputs().size(); ++i) { + for(size_t i = 0, n = 0; i < schema.arguments.size(); ++i) { const auto& arg = schema.arguments[i]; - auto input = node->input(i); + // this was a builtin with a vararg list lowered, + if(arg.type->kind() == TypeKind::ListType) { + // we do not support constant lifting of the arg itself + if(arg.attribute_info) + return; + // but we do support it for other values so we need to skip all the vararg nodes: + size_t vararg_list_size = node->inputs().size() - (schema.arguments.size() - 1); + while(n < i + vararg_list_size) { + new_inputs.push_back(node->input(n++)); + } + continue; + } + auto input = node->input(n++); if(arg.attribute_info) { switch(arg.attribute_info->kind) { case AttributeKind::i: { diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index e488a7535ea25b..d9d44fe5658685 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -4,6 +4,14 @@ #include "torch/csrc/jit/tensor_conversions.h" #include 
"torch/csrc/jit/python_tracer.h" +#include + +#include +#include +#include +#include +#include + namespace torch { namespace jit { namespace script { @@ -248,11 +256,11 @@ struct ModuleValue : public SugaredValue { // select an attribute on it, e.g. `this.field` virtual std::shared_ptr attr(SourceRange loc, Method & m, const std::string& field) override { - if(at::optional v = module->find_module(field)) { + if(NamedModule* v = module->find_module(field)) { return std::make_shared(v->module); - } else if(at::optional v = module->find_method(field)) { + } else if(Method* v = module->find_method(field)) { return std::make_shared(module, *v); - } else if(at::optional v = module->find_parameter(field)) { + } else if(NamedParameter* v = module->find_parameter(field)) { return std::make_shared(m.get_or_add_parameter(v->slot())); } // This can also be a call to a non-script module, or a plain @@ -330,11 +338,11 @@ py::object unpackVariableTensorList(std::vector outputs) { } static void gatherParametersAndBuffers(std::vector & values, const Module & m) { - for(auto & params : m.get_parameters()) { - values.push_back(params.slot()); + for(auto & param : m.get_parameters()) { + values.push_back(param->slot()); } for(const auto & sub : m.get_modules()) { - gatherParametersAndBuffers(values, *sub.module); + gatherParametersAndBuffers(values, *sub->module); } } @@ -378,8 +386,8 @@ void initJitScriptBindings(PyObject* module) { auto & modules = self.get_modules(); py::tuple result(modules.size()); for(size_t i = 0; i < modules.size(); ++i) { - auto & nm = modules[i]; - result[i] = std::make_pair(nm.name, nm.module); + auto & item = modules[i]; + result[i] = std::make_pair(item.key, item.value); } return result; }) @@ -390,9 +398,9 @@ void initJitScriptBindings(PyObject* module) { auto & p = parameters[i]; py::tuple r(3); result[i] = std::make_tuple( - p.name, - static_cast(*p.slot()), - p.is_buffer); + p.key, + static_cast(*p->slot()), + p->is_buffer); } return result; @@ -416,8 +424,9 @@ void initJitScriptBindings(PyObject* module) { return bool(self.find_method(name)); }) .def("_method_names", [](Module& self) { - return fmap(self.get_methods(), [](const std::unique_ptr & m) { - return m->name(); + using Item = torch::detail::OrderedDict>::Item; + return fmap(self.get_methods(), [](const Item & item) { + return (*item)->name(); }); }) .def("_create_method_from_trace", []( diff --git a/torch/csrc/jit/script/module.h b/torch/csrc/jit/script/module.h index 64ea9118da5c56..b51a5fdf82d10d 100644 --- a/torch/csrc/jit/script/module.h +++ b/torch/csrc/jit/script/module.h @@ -7,8 +7,17 @@ #include "torch/csrc/jit/function_schema.h" #include "torch/csrc/jit/named_value.h" +#include + #include +#include + #include +#include +#include +#include +#include +#include // This file contains classes which assist in desugaring Python style // modules and their methods into flattened graphs which don't have any @@ -189,69 +198,12 @@ struct NamedParameter { std::unique_ptr parameter; }; -// simple ordered dict used only in Module -// contains only the minimum necessary functionality for Module -template -struct OrderedDict { - OrderedDict(const char * what) - : what(what) {} - // note: slight difference from python here. 
- // we do not allow for insertion of an already existing value, - // because we not allow allow methods or submodules to be updated - // once created - T& insert(const std::string& name, T&& value) { - if(index_.count(name) != 0) { - std::stringstream ss; - ss << "module " << what << "'" << name << "' already defined."; - throw std::runtime_error(ss.str()); - } - values_.push_back(std::move(value)); - index_[name] = values_.size() - 1; - return values_.back(); - } - at::optional find(const std::string& str) { - auto it = index_.find(str); - if(it == index_.end()) - return at::nullopt; - return at::optional(values_.at(it->second)); - } - at::optional find(const std::string& str) const { - auto it = index_.find(str); - if(it == index_.end()) - return at::nullopt; - return at::optional(values_.at(it->second)); - } - T& get(const std::string& name) { - if(auto v = find(name)) { - return *v; - } - std::stringstream ss; - ss << "module " << what << "'" << name << "' is not defined."; - throw std::runtime_error(ss.str()); - } - const T& get(const std::string& name) const { - if(auto v = find(name)) { - return *v; - } - std::stringstream ss; - ss << "module " << what << "'" << name << "' is not defined."; - throw std::runtime_error(ss.str()); - } - const std::vector& values() const { - return values_; - } -private: - std::unordered_map index_; - std::vector values_; - const char * what; -}; - struct Module : public std::enable_shared_from_this { TH_DISALLOW_COPY_AND_ASSIGN(Module); Module() - : modules("modules") - , parameters("parameters") - , methods("methods") + : modules("Module") + , parameters("Parameter") + , methods("Method") , optimize(true) {} // note this doesn't change the flags of existing methods just ones @@ -296,7 +248,7 @@ struct Module : public std::enable_shared_from_this { } // each module owns its method. The reference returned here - // is guarenteed to stay valid until this module has been destoryed + // is guarenteed to stay valid until this module has been destroyed Method& get_method(const std::string& name) const { return *methods.get(name); } @@ -305,39 +257,38 @@ struct Module : public std::enable_shared_from_this { return modules.get(name).module; } - const std::vector& get_modules() const { - return modules.values(); + const detail::OrderedDict& get_modules() const { + return modules; } - const std::vector& get_parameters() const { - return parameters.values(); + const detail::OrderedDict& get_parameters() const { + return parameters; } - const std::vector>& get_methods() const { - return methods.values(); + const detail::OrderedDict>& get_methods() const { + return methods; } - - at::optional find_parameter(const std::string& name) { + NamedParameter* find_parameter(const std::string& name) { return parameters.find(name); } - at::optional find_module(const std::string& name) { + NamedModule* find_module(const std::string& name) { return modules.find(name); } - at::optional find_method(const std::string& name) { - if(auto pm = methods.find(name)) - return at::optional(**pm); - return at::nullopt; + Method* find_method(const std::string& name) { + if (auto* pm = methods.find(name)) { + return pm->get(); + } + return nullptr; } - -private: + private: // invariant: to ensure member_inputs of Methods stay valid, // it is only legal to _add_ new modules and parameters. 
// removing them will allow member_inputs to point to invalid parameters // no such restriction exists for methods - OrderedDict modules; - OrderedDict parameters; - OrderedDict> methods; + detail::OrderedDict modules; + detail::OrderedDict parameters; + detail::OrderedDict> methods; bool optimize; }; diff --git a/torch/csrc/jit/symbolic_variable.h b/torch/csrc/jit/symbolic_variable.h index 1e971f96ade355..238022333ff7c8 100644 --- a/torch/csrc/jit/symbolic_variable.h +++ b/torch/csrc/jit/symbolic_variable.h @@ -79,6 +79,24 @@ struct SymbolicVariable { SymbolicVariable operator-() const { return create(aten::neg, {*this})[0].typeLike(*this); } + SymbolicVariable operator-(const SymbolicVariable rhs) const { + Node *n; + auto r = create(aten::sub, {*this, rhs}, 1, &n)[0].typeLike(*this); + n->t_(attr::alpha, at::Scalar(1).toTensor()); + return r; + } + SymbolicVariable operator/(at::Scalar rhs) const { + Node *n; + auto r = create(aten::div, {*this}, 1, &n)[0].typeLike(*this); + n->t_(attr::other, rhs.toTensor()); + return r; + } + SymbolicVariable operator%(at::Scalar rhs) const { + Node *n; + auto r = create(aten::remainder, {*this}, 1, &n)[0].typeLike(*this); + n->t_(attr::other, rhs.toTensor()); + return r; + } SymbolicVariable mm(const SymbolicVariable rhs) const { auto r = create(t("mm"), {*this, rhs})[0]; return r; diff --git a/torch/csrc/jit/tensor_conversions.h b/torch/csrc/jit/tensor_conversions.h index 3401d0c0d9b2e1..ef3c0febdaa711 100644 --- a/torch/csrc/jit/tensor_conversions.h +++ b/torch/csrc/jit/tensor_conversions.h @@ -123,4 +123,22 @@ inline at::Tensor as_variable(const T& t) { return autograd::make_variable(as_tensor(t)); } +////////////////////////////////////////////////////////////////////////////////// +// Helper for retrieving constants +////////////////////////////////////////////////////////////////////////////////// + +// if a value is a constant then try to turn into type T using the +// same rules as the interpreter +template +at::optional constant_as(Value* v) { + if(v->node()->kind() != prim::Constant) + return at::nullopt; + auto tensor = v->node()->t(attr::value); + try { + return tensor_as(std::move(tensor)); + } catch (tensor_conversion_error& err) { + return at::nullopt; + } +} + }} // namespace torch::jit diff --git a/torch/csrc/utils.cpp b/torch/csrc/utils.cpp index af7cb560ddbbfb..f8266acec3e8e8 100644 --- a/torch/csrc/utils.cpp +++ b/torch/csrc/utils.cpp @@ -17,10 +17,6 @@ #include "generic/utils.cpp" #include -#ifdef WITH_CUDA -#include "torch/csrc/cuda/THCP.h" -#endif - int THPUtils_getCallable(PyObject *arg, PyObject **result) { if (!PyCallable_Check(arg)) return 0; @@ -231,30 +227,3 @@ bool maybeThrowBackCompatKeepdimWarn(char *func) { } return true; } - -#ifdef WITH_CUDA -std::vector THPUtils_PySequence_to_THCStreamList(PyObject *obj) { - if (!PySequence_Check(obj)) { - throw std::runtime_error("Expected a sequence in THPUtils_PySequence_to_THCStreamList"); - } - THPObjectPtr seq = THPObjectPtr(PySequence_Fast(obj, NULL)); - if (seq.get() == NULL) { - throw std::runtime_error("expected PySequence, but got " + std::string(THPUtils_typename(obj))); - } - - std::vector streams; - Py_ssize_t length = PySequence_Fast_GET_SIZE(seq.get()); - for (Py_ssize_t i = 0; i < length; i++) { - PyObject *stream = PySequence_Fast_GET_ITEM(seq.get(), i); - - if (PyObject_IsInstance(stream, THCPStreamClass)) { - streams.push_back( ((THCPStream *)stream)->cdata); - } else if (stream == Py_None) { - streams.push_back(NULL); - } else { - std::runtime_error("Unknown 
data type found in stream list. Need THCStream or None"); - } - } - return streams; -} -#endif diff --git a/torch/csrc/utils.h b/torch/csrc/utils.h index 5756954bc27027..296cd5e97229fe 100644 --- a/torch/csrc/utils.h +++ b/torch/csrc/utils.h @@ -133,11 +133,10 @@ void THPUtils_addPyMethodDefs(std::vector& vector, PyMethodDef* met int THPUtils_getCallable(PyObject *arg, PyObject **result); -#define THStoragePtr TH_CONCAT_3(TH,Real,StoragePtr) -#define THTensorPtr TH_CONCAT_3(TH,Real,TensorPtr) +#define THWStoragePtr TH_CONCAT_3(TH,Real,StoragePtr) +#define THWTensorPtr TH_CONCAT_3(TH,Real,TensorPtr) #define THPStoragePtr TH_CONCAT_3(THP,Real,StoragePtr) #define THPTensorPtr TH_CONCAT_3(THP,Real,TensorPtr) -#define THSTensorPtr TH_CONCAT_3(THS,Real,TensorPtr) #define THSPTensorPtr TH_CONCAT_3(THSP,Real,TensorPtr) typedef THPPointer THPGeneratorPtr; diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index f4e20f722b18a6..0dd9099d8a677b 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -367,9 +367,9 @@ def _script_graph(fn, _frames_up=0): return _jit_script_compile(ast, rcb) -def script(fn, _frames_up=0): +def script(fn, optimize=True, _frames_up=0): graph = _script_graph(fn, _frames_up=_frames_up + 1) - return functools.wraps(fn)(torch._C.GraphExecutor(graph, True)) + return functools.wraps(fn)(torch._C.GraphExecutor(graph, optimize)) ScriptMethodStub = namedtuple('ScriptMethodStub', ('resolution_callback', 'ast', 'original_method')) diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt index d661af34038985..2e216216da59cb 100644 --- a/torch/lib/c10d/CMakeLists.txt +++ b/torch/lib/c10d/CMakeLists.txt @@ -15,9 +15,9 @@ include(CMakeInitializeConfigs) # Relies on CMAKE_INSTALL_PREFIX to be set to ../tmp_install. # It then finds $PREFIX/share/cmake/ATen/ATenConfig.cmake, # which defines ATEN_INCLUDE_DIR and ATEN_LIBRARIES. 
-find_package(ATen REQUIRED) -if(NOT ATen_FOUND) - message(FATAL_ERROR "ATen not found") +find_package(Caffe2 REQUIRED) +if(NOT Caffe2_FOUND) + message(FATAL_ERROR "Caffe2 not found") endif() find_package(Gloo REQUIRED) @@ -39,12 +39,6 @@ if(NOT CUDA_FOUND) message(FATAL_ERROR "CUDA not found") endif() -set(CUDA_PROPAGATE_HOST_FLAGS OFF) -if (NOT MSVC) - list(APPEND CUDA_NVCC_FLAGS "-std=c++11") - list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC") -endif() - set(C10D_SRCS Utils.cpp Store.cpp @@ -55,13 +49,17 @@ set(C10D_SRCS ) add_library(c10d ${C10D_SRCS}) -target_compile_options(c10d PUBLIC "-std=c++11") -target_include_directories(c10d PUBLIC ${ATEN_INCLUDE_DIR}) -target_link_libraries(c10d PUBLIC ${ATEN_LIBRARIES}) -target_include_directories(c10d PUBLIC ${CUDA_INCLUDE_DIRS}) -target_link_libraries(c10d PUBLIC ${CUDA_LIBRARIES}) +target_compile_options(c10d PUBLIC + -Wall + -Wextra + -Wno-unused-parameter + -Wno-missing-field-initializers + -Wno-write-strings + -Wno-unknown-pragmas + ) +target_link_libraries(c10d PUBLIC caffe2_gpu) -# c10d links to ATen, but the ATen targets don't add TH/THC to the include path +# c10d links to Caffe2/ATen, but the targets don't add TH/THC to the include path target_include_directories(c10d PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../tmp_install/include/TH) target_include_directories(c10d PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../tmp_install/include/THC) diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index 01d29829661195..435486b6f64071 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -85,6 +85,8 @@ const ::gloo::ReductionFunction* reductionFunction(const ReduceOp& r) { return ::gloo::ReductionFunction::min; case ReduceOp::MAX: return ::gloo::ReductionFunction::max; + case ReduceOp::UNUSED: + break; } throw std::runtime_error("Unhandled ReduceOp"); @@ -92,7 +94,7 @@ const ::gloo::ReductionFunction* reductionFunction(const ReduceOp& r) { std::vector getStreamVector(AlgorithmEntry& entry) { std::vector streams(entry.streams.size()); - for (auto i = 0; i < entry.streams.size(); i++) { + for (size_t i = 0; i < entry.streams.size(); i++) { streams[i] = entry.streams[i].getStream(); } return streams; @@ -103,7 +105,7 @@ std::vector getStreamVector(AlgorithmEntry& entry) { void synchronizeStreams(THCState* thcState, AlgorithmEntry* entry) { CUDADevice deviceGuard; const auto& key = entry->key; - for (auto i = 0; i < key.devices.size(); i++) { + for (size_t i = 0; i < key.devices.size(); i++) { const auto& device = key.devices[i]; auto publicStream = THCState_getCurrentStreamOnDevice(thcState, device); auto privateStream = entry->streams[i].getStream(); @@ -138,7 +140,7 @@ bool ProcessGroupGloo::WorkGloo::isSuccess() const { void ProcessGroupGloo::WorkGloo::synchronize() { if (cuda_) { auto thcState = ::at::globalContext().lazyInitCUDA(); - for (auto i = 0; i < devices_.size(); i++) { + for (size_t i = 0; i < devices_.size(); i++) { auto stream = THCState_getCurrentStreamOnDevice(thcState, devices_[i]); auto event = events_[i].getEvent(); C10D_CUDA_CHECK(cudaStreamWaitEvent(stream, event, 0)); @@ -174,7 +176,7 @@ void ProcessGroupGloo::WorkGloo::finish(const AlgorithmEntry& entry) { CUDADevice deviceGuard; devices_ = entry.key.devices; events_.resize(devices_.size()); - for (auto i = 0; i < devices_.size(); i++) { + for (size_t i = 0; i < devices_.size(); i++) { deviceGuard.setDevice(devices_[i]); events_[i] = CUDAEvent::create(); const auto& event = events_[i].getEvent(); @@ -218,7 +220,7 
@@ ProcessGroupGloo::ProcessGroupGloo( } threads_.resize(options.threads); - for (int i = 0; i < threads_.size(); i++) { + for (size_t i = 0; i < threads_.size(); i++) { threads_[i] = std::thread(&ProcessGroupGloo::runLoop, this); } @@ -297,6 +299,8 @@ void ProcessGroupGloo::createAlgorithm(AlgorithmEntry& entry) { case CollectiveType::BROADCAST: GENERATE_ALL_TYPES(key.type->scalarType(), createBroadcast, entry); return; + case CollectiveType::UNUSED: + break; } throw std::runtime_error("Unhandled collective type"); @@ -387,7 +391,7 @@ EntryType ProcessGroupGloo::construct(const AlgorithmKey& key) { // Allocate source tensors for this entry auto& srcSizes = key.srcSizes; entry->src.resize(srcSizes.size()); - for (int i = 0; i < srcSizes.size(); i++) { + for (size_t i = 0; i < srcSizes.size(); i++) { deviceGuard.setDevice(key.type->is_cuda() ? key.devices[i] : -1); entry->src[i] = key.type->tensor(srcSizes[i]); } @@ -396,7 +400,7 @@ EntryType ProcessGroupGloo::construct(const AlgorithmKey& key) { if (key.type->is_cuda()) { entry->streams.resize(key.devices.size()); entry->events.resize(key.devices.size()); - for (auto i = 0; i < key.devices.size(); i++) { + for (size_t i = 0; i < key.devices.size(); i++) { deviceGuard.setDevice(key.devices[i]); entry->streams[i] = CUDAStream::create(); entry->events[i] = CUDAEvent::create(); @@ -466,7 +470,7 @@ std::shared_ptr ProcessGroupGloo::broadcast( synchronizeStreams(thcState_, entry); entry->run = [=]() mutable { entry->algorithm->run(); - for (auto i = 0; i < tensors.size(); i++) { + for (size_t i = 0; i < tensors.size(); i++) { // The THCStreamGuard is a RAII wrapper for temporarily // overriding the current THCStream. This also sets the // current device to the stream's device. @@ -477,7 +481,7 @@ std::shared_ptr ProcessGroupGloo::broadcast( } else { entry->run = [=]() mutable { entry->algorithm->run(); - for (auto i = 0; i < tensors.size(); i++) { + for (size_t i = 0; i < tensors.size(); i++) { tensors[i].copy_(entry->src[i]); } }; @@ -502,7 +506,7 @@ std::shared_ptr ProcessGroupGloo::allreduce( auto entry = checkout(key); // Copy input tensors - for (auto i = 0; i < tensors.size(); i++) { + for (size_t i = 0; i < tensors.size(); i++) { entry->src[i].copy_(tensors[i]); } @@ -512,7 +516,7 @@ std::shared_ptr ProcessGroupGloo::allreduce( synchronizeStreams(thcState_, entry); entry->run = [=]() mutable { entry->algorithm->run(); - for (auto i = 0; i < tensors.size(); i++) { + for (size_t i = 0; i < tensors.size(); i++) { // The THCStreamGuard is a RAII wrapper for temporarily // overriding the current THCStream. This also sets the // current device to the stream's device. 
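The "auto i = 0" to "size_t i = 0" changes above and below (here, in Utils.hpp, and in the c10d tests) all fix the same pattern: auto i = 0 deduces a signed int, and comparing it against an unsigned .size() value trips -Wsign-compare, which GCC and Clang report once the -Wall -Wextra options added to the c10d CMakeLists above are in effect; that is presumably what motivated these mechanical edits. A minimal sketch of the pattern, using a made-up helper that is not part of c10d:

    #include <cstddef>
    #include <vector>

    // Hypothetical example, not c10d code: visit a vector of per-device sizes.
    void touchAll(const std::vector<int>& sizes) {
      // "for (auto i = 0; ...)" would deduce int and warn under -Wsign-compare
      // when compared with sizes.size(); size_t keeps the comparison unsigned.
      for (size_t i = 0; i < sizes.size(); i++) {
        (void)sizes[i];  // per-element work would go here
      }
    }
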
@@ -523,7 +527,7 @@ std::shared_ptr ProcessGroupGloo::allreduce( } else { entry->run = [=]() mutable { entry->algorithm->run(); - for (auto i = 0; i < tensors.size(); i++) { + for (size_t i = 0; i < tensors.size(); i++) { tensors[i].copy_(entry->src[i]); } }; diff --git a/torch/lib/c10d/Types.hpp b/torch/lib/c10d/Types.hpp index e589c6d621bd87..163e2312b55741 100644 --- a/torch/lib/c10d/Types.hpp +++ b/torch/lib/c10d/Types.hpp @@ -5,20 +5,8 @@ namespace c10d { enum class CollectiveType : std::uint8_t { - ALLGATHER = 0, - GATHER, - SCATTER, - ALLREDUCE, - REDUCE, BROADCAST, - SEND, - BARRIER, - UNUSED, -}; - -enum class DeviceType : std::uint8_t { - CPU = 0, - CUDA, + ALLREDUCE, UNUSED, }; diff --git a/torch/lib/c10d/Utils.hpp b/torch/lib/c10d/Utils.hpp index a079ff2cfe2598..321268c029a5d3 100644 --- a/torch/lib/c10d/Utils.hpp +++ b/torch/lib/c10d/Utils.hpp @@ -23,7 +23,7 @@ namespace c10d { inline std::string toString(at::IntList l) { std::stringstream ss; ss << "("; - for (int i = 0; i < l.size(); i++) { + for (size_t i = 0; i < l.size(); i++) { if (i > 0) { ss << ", "; } @@ -42,7 +42,7 @@ inline void assertSameSizeAndType(const std::vector& tensors) { // Ensure all tensors have identical type and shape auto& type = tensors[0].type(); auto sizes = tensors[0].sizes(); - for (auto i = 1; i < tensors.size(); i++) { + for (size_t i = 1; i < tensors.size(); i++) { if (tensors[i].type() != type) { const std::string expected = type.toString(); const std::string actual = tensors[i].type().toString(); @@ -63,7 +63,7 @@ inline void assertSameSizeAndType(const std::vector& tensors) { inline std::vector> getSizes( const std::vector& tensors) { std::vector> sizes(tensors.size()); - for (auto i = 0; i < tensors.size(); i++) { + for (size_t i = 0; i < tensors.size(); i++) { sizes[i] = tensors[i].sizes(); } return sizes; @@ -72,7 +72,7 @@ inline std::vector> getSizes( inline std::vector getDevices(const std::vector& tensors) { std::vector devices(tensors.size(), -1); if (tensors[0].type().is_cuda()) { - for (auto i = 0; i < tensors.size(); i++) { + for (size_t i = 0; i < tensors.size(); i++) { devices[i] = tensors[i].storage()->getDevice(); } } @@ -82,7 +82,7 @@ inline std::vector getDevices(const std::vector& tensors) { template std::vector getDataPointers(const std::vector& tensors) { std::vector ptrs(tensors.size()); - for (auto i = 0; i < tensors.size(); i++) { + for (size_t i = 0; i < tensors.size(); i++) { ptrs[i] = static_cast(tensors[i].storage()->data()); } return ptrs; diff --git a/torch/lib/c10d/test/CMakeLists.txt b/torch/lib/c10d/test/CMakeLists.txt index ab06631f02ff39..3138a7603bb03c 100644 --- a/torch/lib/c10d/test/CMakeLists.txt +++ b/torch/lib/c10d/test/CMakeLists.txt @@ -1,5 +1,4 @@ cuda_add_library(c10d_cuda_test CUDATest.cu) -target_compile_options(c10d_cuda_test PUBLIC "-std=c++11") target_link_libraries(c10d_cuda_test c10d) function(c10d_add_test test_src) diff --git a/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp index 2e1bc8de07f1f6..acc10ad1e87bdf 100644 --- a/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp @@ -186,25 +186,25 @@ class AsyncBroadcastTest : public AsyncInputIsOutputTest { void runAsyncAllreduceTest( const std::string& path, - int numProcesses, - int numTensors) { + size_t numProcesses, + size_t numTensors) { auto tests = initialize(path, numProcesses, numTensors); std::vector> work(numProcesses); - for (auto i = 0; i < numProcesses; i++) { + for 
(size_t i = 0; i < numProcesses; i++) { work[i] = tests[i].run(); } // Wait for work to complete - for (auto i = 0; i < numProcesses; i++) { + for (size_t i = 0; i < numProcesses; i++) { tests[i].wait(work[i]); } // Check results - for (auto i = 0; i < numProcesses; i++) { + for (size_t i = 0; i < numProcesses; i++) { const auto size = numProcesses * numTensors; const auto expected = (size * (size - 1)) / 2; auto tensors = tests[i].getTensors(); - for (auto j = 0; j < tensors.size(); j++) { + for (size_t j = 0; j < tensors.size(); j++) { auto& tensor = tensors[j]; auto data = tensor.data(); for (auto k = 0; k < tensor.numel(); k++) { @@ -218,28 +218,28 @@ void runAsyncAllreduceTest( void runAsyncBroadcastTest( const std::string& path, - int numProcesses, - int numTensors) { + size_t numProcesses, + size_t numTensors) { auto tests = initialize(path, numProcesses, numTensors); // Try every permutation of root rank and root tensor - for (auto rootRank = 0; rootRank < numProcesses; rootRank++) { - for (auto rootTensor = 0; rootTensor < numTensors; rootTensor++) { + for (size_t rootRank = 0; rootRank < numProcesses; rootRank++) { + for (size_t rootTensor = 0; rootTensor < numTensors; rootTensor++) { std::vector> work(numProcesses); - for (auto i = 0; i < numProcesses; i++) { + for (size_t i = 0; i < numProcesses; i++) { work[i] = tests[i].run(rootRank, rootTensor); } // Wait for work to complete - for (auto i = 0; i < numProcesses; i++) { + for (size_t i = 0; i < numProcesses; i++) { tests[i].wait(work[i]); } // Check results const auto expected = (rootRank * numTensors + rootTensor); - for (auto i = 0; i < numProcesses; i++) { + for (size_t i = 0; i < numProcesses; i++) { auto tensors = tests[i].getTensors(); - for (auto j = 0; j < tensors.size(); j++) { + for (size_t j = 0; j < tensors.size(); j++) { auto& tensor = tensors[j]; auto data = tensor.data(); for (auto k = 0; k < tensor.numel(); k++) { diff --git a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp index 744f6c535fdab3..8aedc14dd9cd0a 100644 --- a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp @@ -139,10 +139,10 @@ class CollectiveTest { std::vector> copyTensors( const std::vector>& inputs) { std::vector> outputs(inputs.size()); - for (auto i = 0; i < inputs.size(); i++) { + for (size_t i = 0; i < inputs.size(); i++) { const auto& input = inputs[i]; std::vector output(input.size()); - for (auto j = 0; j < input.size(); j++) { + for (size_t j = 0; j < input.size(); j++) { output[j] = input[j].toBackend(at::kCPU); } outputs[i] = std::move(output); diff --git a/torch/nn/parallel/parallel_apply.py b/torch/nn/parallel/parallel_apply.py index 281f72a716e2e4..6a5ab99cacfae1 100644 --- a/torch/nn/parallel/parallel_apply.py +++ b/torch/nn/parallel/parallel_apply.py @@ -18,6 +18,15 @@ def get_a_var(obj): def parallel_apply(modules, inputs, kwargs_tup=None, devices=None): + r"""Applies each `module` in :attr:`modules` in parallel on arguments + contained in :attr:`inputs` (positional) and :attr:`kwargs_tup` (keyword) + on each of :attr:`devices`. + + :attr:`modules`, :attr:`inputs`, :attr:`kwargs_tup` (if given), and + :attr:`devices` (if given) should all have same length. Moreover, each + element of :attr:`inputs` can either be a single object as the only argument + to a module, or a collection of positional arguments. 
+ """ assert len(modules) == len(inputs) if kwargs_tup is not None: assert len(modules) == len(kwargs_tup) @@ -38,6 +47,9 @@ def _worker(i, module, input, kwargs, device=None): device = get_a_var(input).get_device() try: with torch.cuda.device(device): + # this also avoids accidental slicing of `input` if it is a Tensor + if not isinstance(input, (list, tuple)): + input = (input,) output = module(*input, **kwargs) with lock: results[i] = output diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index 89b05ae0d23060..e7cd77bb9a8731 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -600,12 +600,15 @@ def index_select(g, self, index, dim): def type_as(g, self, other): - if self.type().scalarType() == other.type().scalarType(): - # no-op + if self.isTensor() and other.isTensor() and self.type().scalarType() == other.type().scalarType(): return self + + if other.isTensor(): + other_type_name = other.type().scalarType() + return g.op("Cast", self, to_i=cast_pytorch_to_onnx[other_type_name]) else: - other_type_name = self.type().scalarType().lower() - return g.op("Cast", self, to_i=cast_pytorch_to_onnx[scalar_name_to_pytorch[other_type_name]]) + # We don't know the type of other, bail by emitting ATen + return g.op("ATen", self, other, operator_s="type_as") # ignore clone operators that are inserted by PyTorch autograd