diff --git a/.circleci/config.yml b/.circleci/config.yml index bcd2a5527b9835..476e9867cf2a4a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -5,11 +5,37 @@ docker_config_defaults: &docker_config_defaults aws_access_key_id: AKIAJ2J6FIG5OSZTQ3IA aws_secret_access_key: ${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_ONLY} +# NOTE: We only perform the merge in build step and not in test step, because +# all source files will be shared from build to test +merge_pull_request_onto_master: &merge_pull_request_onto_master + name: Merge Onto Master + no_output_timeout: "10h" + command: | + if [[ "${CIRCLE_BRANCH}" != "master" ]]; then + git config --global user.email "circleci.ossci@gmail.com" + git config --global user.name "CircleCI" + + git config remote.origin.url https://github.com/pytorch/pytorch.git + git config --add remote.origin.fetch +refs/heads/master:refs/remotes/origin/master + git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/master:refs/remotes/origin/master --depth=50 --quiet + + export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/master` + echo "GIT_MERGE_TARGET: " ${GIT_MERGE_TARGET} + export GIT_COMMIT=${CIRCLE_SHA1} + echo "GIT_COMMIT: " ${GIT_COMMIT} + + git checkout -f ${GIT_COMMIT} + git reset --hard ${GIT_COMMIT} + git merge --no-edit --no-ff ${GIT_MERGE_TARGET} + fi + pytorch_linux_cpu_build_test_defaults: &pytorch_linux_cpu_build_test_defaults resource_class: large working_directory: /var/lib/jenkins/workspace steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build no_output_timeout: "10h" @@ -22,7 +48,7 @@ pytorch_linux_cpu_build_test_defaults: &pytorch_linux_cpu_build_test_defaults # This IAM user allows write access to S3 bucket for sccache export AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET} - git submodule update --init + git submodule sync && git submodule update --init .jenkins/pytorch/build.sh .jenkins/pytorch/test.sh @@ -31,6 +57,8 @@ pytorch_linux_build_defaults: &pytorch_linux_build_defaults working_directory: /var/lib/jenkins/workspace steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build no_output_timeout: "10h" @@ -46,16 +74,18 @@ pytorch_linux_build_defaults: &pytorch_linux_build_defaults # This IAM user allows write access to S3 bucket for sccache export AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET} - git submodule update --init || git submodule update --init || git submodule update --init + git submodule sync && git submodule update --init .jenkins/pytorch/build.sh - mkdir -p pytorch-ci-env/ - cp -r /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch pytorch-ci-env/torch - cp -r build/bin pytorch-ci-env/cpp_test_bin + export PYTORCH_CI_ENV_DIR=/var/lib/jenkins/pytorch-ci-env + mkdir -p ${PYTORCH_CI_ENV_DIR} + cp -r /var/lib/jenkins/workspace ${PYTORCH_CI_ENV_DIR}/build_workspace # This copies all source files from build step to the next step + cp -r /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch ${PYTORCH_CI_ENV_DIR}/torch + cp -r build/bin ${PYTORCH_CI_ENV_DIR}/cpp_test_bin if [ -d "../cpp-build" ]; then - cp -r ../cpp-build pytorch-ci-env/cpp-build + cp -r ../cpp-build ${PYTORCH_CI_ENV_DIR}/cpp-build fi - persist_to_workspace: - root: /var/lib/jenkins/workspace/pytorch-ci-env + root: /var/lib/jenkins/pytorch-ci-env paths: - "*" @@ -63,7 +93,6 @@ pytorch_linux_test_defaults: 
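The merge_pull_request_onto_master anchor above rebuilds every pull request on top of the current tip of master rather than the original branch point, and it runs only in the build job because the merged tree is handed to the test jobs through the persisted workspace. A minimal standalone sketch of the same merge step, assuming a generic origin remote and CircleCI's CIRCLE_BRANCH / CIRCLE_SHA1 variables (the committer identity below is a placeholder):

#!/usr/bin/env bash
# Sketch: rebuild a PR on top of the latest master before compiling.
set -euo pipefail

if [[ "${CIRCLE_BRANCH}" != "master" ]]; then
  git config user.email "ci-bot@example.com"   # placeholder identity
  git config user.name  "CI"

  # Fetch only master, shallowly, so the merge target is the current tip.
  git fetch --quiet --depth=50 origin +refs/heads/master:refs/remotes/origin/master
  merge_target=$(git rev-parse origin/master)

  git checkout -f "${CIRCLE_SHA1}"
  # --no-ff keeps the PR head and the master tip as separate parents, so a
  # conflicting merge fails the job here rather than silently fast-forwarding.
  git merge --no-edit --no-ff "${merge_target}"
fi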
&pytorch_linux_test_defaults machine: image: default steps: - - checkout - run: name: Prepare workspace command: | @@ -107,12 +136,16 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) fi pwd + + cp -r /opt/workspace/build_workspace/. /home/circleci/project # This copies all source files from build step to the current step + echo "declare -x IN_CIRCLECI=1" > /home/circleci/project/env echo "declare -x PYTHON_VERSION=${PYTHON_VERSION}" >> /home/circleci/project/env echo "declare -x SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> /home/circleci/project/env # This IAM user allows write access to S3 bucket for sccache echo "declare -x AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA" >> /home/circleci/project/env echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET}" >> /home/circleci/project/env + mkdir -p /home/circleci/project/build cp -r /opt/workspace/cpp_test_bin /home/circleci/project/build/bin docker cp /home/circleci/project/. "$id:/var/lib/jenkins/workspace" @@ -122,9 +155,9 @@ pytorch_linux_test_defaults: &pytorch_linux_test_defaults docker cp "/opt/workspace/cpp-build" "$id:/var/lib/jenkins/cpp-build" fi if [ -n "${MULTI_GPU}" ]; then - (echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch && cd workspace && (git submodule update --init || git submodule update --init || git submodule update --init) && .jenkins/pytorch/multigpu-test.sh') | docker exec -u jenkins -i "$id" bash + (echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch && cd workspace && .jenkins/pytorch/multigpu-test.sh') | docker exec -u jenkins -i "$id" bash else - (echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch && cd workspace && (git submodule update --init || git submodule update --init || git submodule update --init) && .jenkins/pytorch/test.sh') | docker exec -u jenkins -i "$id" bash + (echo "source ./workspace/env" && echo 'sudo chown -R jenkins workspace /opt/conda/lib/python${PYTHON_VERSION}/site-packages/torch && cd workspace && .jenkins/pytorch/test.sh') | docker exec -u jenkins -i "$id" bash fi caffe2_linux_build_defaults: &caffe2_linux_build_defaults @@ -132,6 +165,8 @@ caffe2_linux_build_defaults: &caffe2_linux_build_defaults working_directory: /var/lib/jenkins/workspace steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build no_output_timeout: "10h" @@ -152,7 +187,7 @@ caffe2_linux_build_defaults: &caffe2_linux_build_defaults cd third_party/onnx && git fetch --tags --progress origin +refs/pull/*:refs/remotes/origin/pr/* && cd - # Reinitialize submodules - git submodule update --init --recursive + git submodule sync && git submodule update --init --recursive # Ensure jenkins can write to the ccache root dir. 
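The test jobs above no longer run a checkout of their own: they attach the workspace persisted by the build job, stage it under /home/circleci/project, copy it into a fresh container with docker cp, and pipe the test command into docker exec. A rough sketch of that hand-off, with a placeholder image name and the test script path taken from the config above:

# Sketch: run the persisted build output inside a fresh test container.
set -e
DOCKER_IMAGE="registry.example.com/pytorch-ci:latest"   # placeholder image

id=$(docker run -t -d -w /var/lib/jenkins "${DOCKER_IMAGE}")

# The build job persisted sources and binaries; CircleCI attaches them under
# /opt/workspace. Stage them locally, then copy them into the container.
cp -r /opt/workspace/build_workspace/. /home/circleci/project
docker cp /home/circleci/project/. "$id:/var/lib/jenkins/workspace"

# Run the test script as the jenkins user inside the container.
echo 'cd workspace && .jenkins/pytorch/test.sh' | docker exec -u jenkins -i "$id" bash
docker rm -f "$id"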
sudo chown jenkins:jenkins "${HOME}/.ccache" @@ -189,16 +224,18 @@ caffe2_linux_build_defaults: &caffe2_linux_build_defaults fi # Copy all necessary binaries to shared workspace - mkdir -p caffe2-ci-env - cp -r third_party/onnx caffe2-ci-env/onnx + export CAFFE2_CI_ENV_DIR=/var/lib/jenkins/caffe2-ci-env + mkdir -p ${CAFFE2_CI_ENV_DIR} + cp -r /var/lib/jenkins/workspace ${CAFFE2_CI_ENV_DIR}/build_workspace # This copies all source files from build step to the next step + cp -r third_party/onnx ${CAFFE2_CI_ENV_DIR}/onnx if [ -d "/usr/local/caffe2" ]; then - cp -r /usr/local/caffe2 caffe2-ci-env/caffe2 + cp -r /usr/local/caffe2 ${CAFFE2_CI_ENV_DIR}/caffe2 fi if [ -d "/opt/conda" ]; then - cp -r /opt/conda caffe2-ci-env/conda_env + cp -r /opt/conda ${CAFFE2_CI_ENV_DIR}/conda_env fi - persist_to_workspace: - root: /var/lib/jenkins/workspace/caffe2-ci-env + root: /var/lib/jenkins/caffe2-ci-env paths: - "*" @@ -206,7 +243,6 @@ caffe2_linux_test_defaults: &caffe2_linux_test_defaults machine: image: default steps: - - checkout - run: name: Prepare workspace command: | @@ -250,6 +286,7 @@ caffe2_linux_test_defaults: &caffe2_linux_test_defaults id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) fi pwd + cp -r /opt/workspace/build_workspace/. /home/circleci/project # This copies all source files from build step to the current step echo "declare -x IN_CIRCLECI=1" > /home/circleci/project/env echo "declare -x SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> /home/circleci/project/env # This IAM user allows write access to S3 bucket for sccache @@ -315,6 +352,8 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults xcode: "9.0" steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build no_output_timeout: "10h" @@ -326,7 +365,7 @@ caffe2_macos_build_defaults: &caffe2_macos_build_defaults brew install cmake # Reinitialize submodules - git submodule update --init --recursive + git submodule sync && git submodule update --init --recursive # Reinitialize path (see man page for path_helper(8)) eval `/usr/libexec/path_helper -s` @@ -525,6 +564,8 @@ jobs: xcode: "9.0" steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build environment: @@ -544,9 +585,12 @@ jobs: export AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET} - git submodule update --init + git submodule sync && git submodule update --init chmod a+x .jenkins/pytorch/macos-build.sh .jenkins/pytorch/macos-build.sh + + # TODO: need to share source files from build to test, when macOS builds are enabled + - persist_to_workspace: root: /Users/distiller/pytorch-ci-env paths: @@ -556,7 +600,6 @@ jobs: macos: xcode: "9.0" steps: - - checkout - run: name: Prepare workspace command: | @@ -570,9 +613,9 @@ jobs: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3 no_output_timeout: "10h" command: | + # TODO: need to share source files from build to test, when macOS builds are enabled set -ex export IN_CIRCLECI=1 - git submodule update --init chmod a+x .jenkins/pytorch/macos-test.sh .jenkins/pytorch/macos-test.sh @@ -581,6 +624,8 @@ jobs: xcode: "9.0" steps: - checkout + - run: + <<: *merge_pull_request_onto_master - run: name: Build environment: @@ -616,7 +661,7 @@ jobs: export AWS_ACCESS_KEY_ID=AKIAJJZUW4G2ASX5W7KA export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET} - git submodule update --init + git submodule sync && git submodule update --init chmod a+x .jenkins/pytorch/macos-build.sh 
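Several steps above switch from a plain `git submodule update --init` to `git submodule sync && git submodule update --init`. The sync matters when .gitmodules has changed a submodule URL: a cached checkout still carries the old URL in .git/config, so the update alone would keep fetching from the stale remote. A short illustration of the difference (no particular submodule is assumed):

# A cached checkout records submodule URLs in .git/config at init time, so if
# .gitmodules later changes a URL, a bare update keeps fetching the old remote:
git submodule update --init --recursive        # may hit the stale URL

# `sync` copies the URLs from .gitmodules back into .git/config first, so the
# subsequent update fetches from the right place:
git submodule sync --recursive
git submodule update --init --recursive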
.jenkins/pytorch/macos-build.sh @@ -845,36 +890,36 @@ workflows: version: 2 build: jobs: - - pytorch_linux_trusty_py2_7_9_build_test - - pytorch_linux_trusty_py2_7_build_test - - pytorch_linux_trusty_py3_5_build_test - - pytorch_linux_trusty_py3_6_gcc4_8_build_test - - pytorch_linux_trusty_py3_6_gcc5_4_build_test - - pytorch_linux_trusty_py3_6_gcc7_build_test - - pytorch_linux_trusty_pynightly_build_test - - pytorch_linux_xenial_py3_clang5_asan_build - - pytorch_linux_xenial_py3_clang5_asan_test: - requires: - - pytorch_linux_xenial_py3_clang5_asan_build - - pytorch_linux_xenial_cuda8_cudnn6_py3_build - - pytorch_linux_xenial_cuda8_cudnn6_py3_test: - requires: - - pytorch_linux_xenial_cuda8_cudnn6_py3_build - - pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test: - requires: - - pytorch_linux_xenial_cuda8_cudnn6_py3_build - - pytorch_linux_xenial_cuda9_cudnn7_py2_build - - pytorch_linux_xenial_cuda9_cudnn7_py2_test: - requires: - - pytorch_linux_xenial_cuda9_cudnn7_py2_build - - pytorch_linux_xenial_cuda9_cudnn7_py3_build - - pytorch_linux_xenial_cuda9_cudnn7_py3_test: - requires: - - pytorch_linux_xenial_cuda9_cudnn7_py3_build - - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build - - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test: - requires: - - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build + # - pytorch_linux_trusty_py2_7_9_build_test + # - pytorch_linux_trusty_py2_7_build_test + # - pytorch_linux_trusty_py3_5_build_test + # - pytorch_linux_trusty_py3_6_gcc4_8_build_test + # - pytorch_linux_trusty_py3_6_gcc5_4_build_test + # - pytorch_linux_trusty_py3_6_gcc7_build_test + # - pytorch_linux_trusty_pynightly_build_test + # - pytorch_linux_xenial_py3_clang5_asan_build + # - pytorch_linux_xenial_py3_clang5_asan_test: + # requires: + # - pytorch_linux_xenial_py3_clang5_asan_build + # - pytorch_linux_xenial_cuda8_cudnn6_py3_build + # - pytorch_linux_xenial_cuda8_cudnn6_py3_test: + # requires: + # - pytorch_linux_xenial_cuda8_cudnn6_py3_build + # - pytorch_linux_xenial_cuda8_cudnn6_py3_multigpu_test: + # requires: + # - pytorch_linux_xenial_cuda8_cudnn6_py3_build + # - pytorch_linux_xenial_cuda9_cudnn7_py2_build + # - pytorch_linux_xenial_cuda9_cudnn7_py2_test: + # requires: + # - pytorch_linux_xenial_cuda9_cudnn7_py2_build + # - pytorch_linux_xenial_cuda9_cudnn7_py3_build + # - pytorch_linux_xenial_cuda9_cudnn7_py3_test: + # requires: + # - pytorch_linux_xenial_cuda9_cudnn7_py3_build + # - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build + # - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test: + # requires: + # - pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build # - pytorch_macos_10_13_py3_build # - pytorch_macos_10_13_py3_test: @@ -882,48 +927,48 @@ workflows: # - pytorch_macos_10_13_py3_build # - pytorch_macos_10_13_cuda9_2_cudnn7_py3_build - - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build - - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test: - requires: - - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build - - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build - - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test: - requires: - - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build - - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_build - - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_test: - requires: - - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_build - - caffe2_py2_mkl_ubuntu16_04_build - - caffe2_py2_mkl_ubuntu16_04_test: - requires: - - caffe2_py2_mkl_ubuntu16_04_build - - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build - - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_test: - requires: - - 
caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build - - caffe2_py2_gcc4_8_ubuntu14_04_build - - caffe2_py2_gcc4_8_ubuntu14_04_test: - requires: - - caffe2_py2_gcc4_8_ubuntu14_04_build - - caffe2_onnx_py2_gcc5_ubuntu16_04_build - - caffe2_onnx_py2_gcc5_ubuntu16_04_test: - requires: - - caffe2_onnx_py2_gcc5_ubuntu16_04_build - - caffe2_conda2_ubuntu16_04_build - - caffe2_conda2_ubuntu16_04_test: - requires: - - caffe2_conda2_ubuntu16_04_build - - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build - - caffe2_py2_gcc4_9_ubuntu14_04_build - - caffe2_py2_clang3_8_ubuntu16_04_build - - caffe2_py2_clang3_9_ubuntu16_04_build - - caffe2_py2_gcc6_ubuntu16_04_build - - caffe2_py2_gcc7_ubuntu16_04_build - - caffe2_py2_cuda8_0_cudnn7_aten_ubuntu16_04_build - - caffe2_py2_android_ubuntu16_04_build - - caffe2_conda3_cuda9_0_cudnn7_ubuntu16_04_build - - caffe2_py2_cuda9_0_cudnn7_centos7_build + # - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build + # - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_test: + # requires: + # - caffe2_py2_cuda8_0_cudnn6_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test: + # requires: + # - caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_test: + # requires: + # - caffe2_py2_cuda9_0_cudnn7_aten_ubuntu16_04_build + # - caffe2_py2_mkl_ubuntu16_04_build + # - caffe2_py2_mkl_ubuntu16_04_test: + # requires: + # - caffe2_py2_mkl_ubuntu16_04_build + # - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build + # - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_test: + # requires: + # - caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build + # - caffe2_py2_gcc4_8_ubuntu14_04_build + # - caffe2_py2_gcc4_8_ubuntu14_04_test: + # requires: + # - caffe2_py2_gcc4_8_ubuntu14_04_build + # - caffe2_onnx_py2_gcc5_ubuntu16_04_build + # - caffe2_onnx_py2_gcc5_ubuntu16_04_test: + # requires: + # - caffe2_onnx_py2_gcc5_ubuntu16_04_build + # - caffe2_conda2_ubuntu16_04_build + # - caffe2_conda2_ubuntu16_04_test: + # requires: + # - caffe2_conda2_ubuntu16_04_build + # - caffe2_py2_cuda8_0_cudnn7_ubuntu16_04_build + # - caffe2_py2_gcc4_9_ubuntu14_04_build + # - caffe2_py2_clang3_8_ubuntu16_04_build + # - caffe2_py2_clang3_9_ubuntu16_04_build + # - caffe2_py2_gcc6_ubuntu16_04_build + # - caffe2_py2_gcc7_ubuntu16_04_build + # - caffe2_py2_cuda8_0_cudnn7_aten_ubuntu16_04_build + # - caffe2_py2_android_ubuntu16_04_build + # - caffe2_conda3_cuda9_0_cudnn7_ubuntu16_04_build + # - caffe2_py2_cuda9_0_cudnn7_centos7_build # - caffe2_py2_ios_macos10_13_build # - caffe2_py2_system_macos10_13_build diff --git a/.gitignore b/.gitignore index da783554fe1120..a56ae2ab81df84 100644 --- a/.gitignore +++ b/.gitignore @@ -25,16 +25,17 @@ aten/src/ATen/cuda/CUDAConfig.h build/ dist/ docs/src/**/* -docs/cpp/xml/ -docs/cpp/html/ -docs/cpp/api/ +docs/cpp/build +docs/cpp/source/api test/.coverage test/cpp/api/mnist +test/custom_operator/model.pt test/data/gpu_tensors.pt test/data/legacy_modules.t7 test/data/legacy_serialized.pt test/data/linear.pt test/htmlcov +test/cpp_extensions/install/ third_party/build/ tools/shared/_utils_internal.py torch.egg-info/ @@ -43,6 +44,7 @@ torch/csrc/cudnn/cuDNN.cpp torch/csrc/generated torch/csrc/generic/TensorMethods.cpp torch/csrc/jit/generated/* +torch/csrc/jit/fusers/Config.h torch/csrc/nn/THCUNN.cpp torch/csrc/nn/THCUNN.cwrap torch/csrc/nn/THNN_generic.cpp @@ -65,6 +67,7 @@ torch/lib/protoc torch/lib/tmp_install torch/lib/torch_shm_manager torch/lib/python* +torch/share/ 
torch/version.py # IPython notebook checkpoints diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index d9b2a2e096a1e5..ffcbbc136b50d5 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -226,7 +226,7 @@ else export MAX_JOBS=`expr $(nproc) - 1` fi - USE_OPENCV=1 BUILD_BINARY=1 python setup.py install --user + USE_LEVELDB=1 USE_LMDB=1 USE_OPENCV=1 BUILD_BINARY=1 python setup.py install --user # This is to save test binaries for testing cp -r torch/lib/tmp_install $INSTALL_PREFIX diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index b0f9c41382601f..585f994367f339 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -49,6 +49,20 @@ fi mkdir -p $TEST_DIR/{cpp,python} +if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then + export LANG=C.UTF-8 + export LC_ALL=C.UTF-8 + + # Pin individual runs to specific gpu so that we can schedule + # multiple jobs on machines that have multi-gpu. + NUM_AMD_GPUS=$(/opt/rocm/bin/rocminfo | grep 'Device Type.*GPU' | wc -l) + if (( $NUM_AMD_GPUS == 0 )); then + echo >&2 "No AMD GPU detected!" + exit 1 + fi + export HIP_VISIBLE_DEVICES=$(($BUILD_NUMBER % $NUM_AMD_GPUS)) +fi + cd "${WORKSPACE}" # C++ tests @@ -62,19 +76,27 @@ for test in $(find "${INSTALL_PREFIX}/test" -executable -type f); do */mkl_utils_test|*/aten/integer_divider_test) continue ;; - */aten/*) - # ATen uses test framework Catch2 - # NB: We do NOT use the xml test reporter, because - # Catch doesn't support multiple reporters + */scalar_tensor_test|*/basic|*/native_test) + if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then + continue + else + "$test" + fi + ;; + *) + # Currently, we use a mixture of gtest (caffe2) and Catch2 (ATen). While + # planning to migrate to gtest as the common PyTorch c++ test suite, we + # currently do NOT use the xml test reporter, because Catch doesn't + # support multiple reporters # c.f. https://github.com/catchorg/Catch2/blob/master/docs/release-notes.md#223 # which means that enabling XML output means you lose useful stdout # output for Jenkins. It's more important to have useful console # output than it is to have XML output for Jenkins. 
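The ROCm block added to .jenkins/caffe2/test.sh above replaces the old hard-coded assumption of four GPUs per box with a count taken from rocminfo, and pins each run to a single device by build number so several jobs can share one multi-GPU host. The same round-robin idea in isolation (the rocminfo path and variable names follow the script above):

# Round-robin one GPU per run so several CI jobs can share a multi-GPU host.
NUM_AMD_GPUS=$(/opt/rocm/bin/rocminfo | grep -c 'Device Type.*GPU')
if (( NUM_AMD_GPUS == 0 )); then
  echo >&2 "No AMD GPU detected!"
  exit 1
fi
# BUILD_NUMBER increases per job, so consecutive jobs land on different GPUs.
export HIP_VISIBLE_DEVICES=$(( BUILD_NUMBER % NUM_AMD_GPUS ))
echo "Pinned to GPU ${HIP_VISIBLE_DEVICES} of ${NUM_AMD_GPUS}"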
+ # Note: in the future, if we want to use xml test reporter once we switch + # to all gtest, one can simply do: + # "$test" --gtest_output=xml:"$gtest_reports_dir/$(basename $test).xml" "$test" ;; - *) - "$test" --gtest_output=xml:"$gtest_reports_dir/$(basename $test).xml" - ;; esac done @@ -98,9 +120,6 @@ fi rocm_ignore_test=() if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then - export LANG=C.UTF-8 - export LC_ALL=C.UTF-8 - # Currently these tests are failing on ROCM platform: # Unknown reasons, need to debug @@ -115,10 +134,6 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then # Our cuda top_k op has some asm code, the hipified version doesn't # compile yet, so we don't have top_k operator for now rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/top_k_test.py") - - # Our AMD CI boxes have 4 gpus on each - # Remove this once we have added multi-gpu support - export HIP_VISIBLE_DEVICES=$(($BUILD_NUMBER % 4)) fi # Python tests diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 3ffed384b081b7..2dc64157c5d00d 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -11,7 +11,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-* ]]; then sudo apt-get install -y --allow-downgrades --allow-change-held-packages libnccl-dev=2.2.13-1+cuda9.0 libnccl2=2.2.13-1+cuda9.0 fi -if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda8-* ]] || [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-cudnn7-py2* ]]; then +if [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda8-* ]] || [[ "$BUILD_ENVIRONMENT" == *-xenial-cuda9-cudnn7-py2* ]] || [[ "$BUILD_ENVIRONMENT" == *-trusty-py2.7.9* ]]; then # TODO: move this to Docker sudo apt-get update sudo apt-get install -y --allow-downgrades --allow-change-held-packages openmpi-bin libopenmpi-dev @@ -102,12 +102,6 @@ fi # Add the test binaries so that they won't be git clean'ed away git add -f build/bin -# Testing ATen install -if [[ "$BUILD_ENVIRONMENT" != *cuda* ]]; then - echo "Testing ATen install" - time tools/test_aten_install.sh -fi - # Test C FFI plugins # cffi install doesn't work for Python 3.7 if [[ "$BUILD_ENVIRONMENT" != *pynightly* ]]; then @@ -124,7 +118,7 @@ if [[ "$BUILD_ENVIRONMENT" == *xenial-cuda8-cudnn6-py3* ]]; then pushd docs # TODO: Don't run this here pip install -r requirements.txt || true - make html + LC_ALL=C make html popd fi @@ -138,4 +132,14 @@ if [[ "$BUILD_TEST_LIBTORCH" == "1" ]]; then pushd ../cpp-build/caffe2 WERROR=1 VERBOSE=1 DEBUG=1 python $BUILD_LIBTORCH_PY popd + + # Build custom operator tests. 
+ CUSTOM_OP_BUILD="$PWD/../custom-op-build" + CUSTOM_OP_TEST="$PWD/test/custom_operator" + SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" + mkdir "$CUSTOM_OP_BUILD" + pushd "$CUSTOM_OP_BUILD" + CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" cmake "$CUSTOM_OP_TEST" + make VERBOSE=1 + popd fi diff --git a/.jenkins/pytorch/common.sh b/.jenkins/pytorch/common.sh index 5ce6ee01a46975..ca728df2b826c5 100644 --- a/.jenkins/pytorch/common.sh +++ b/.jenkins/pytorch/common.sh @@ -112,7 +112,8 @@ else exit 1 fi -if [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-trusty-py3.6-gcc7* ]]; then +if [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-xenial-cuda9-cudnn7-py3 ]] || \ + [[ "$BUILD_ENVIRONMENT" == *pytorch-linux-trusty-py3.6-gcc7* ]]; then BUILD_TEST_LIBTORCH=1 else BUILD_TEST_LIBTORCH=0 diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index 87e0476e418ba5..16d34342c544c8 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -78,13 +78,35 @@ test_cpp_api() { "$CPP_BUILD"/caffe2/bin/test_api } +test_custom_script_ops() { + echo "Testing custom script operators" + pushd test/custom_operator + # Build the custom operator library. + rm -rf build && mkdir build + pushd build + SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" + CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" cmake .. + make VERBOSE=1 + popd + + # Run tests Python-side and export a script module. + python test_custom_ops.py -v + python model.py --export-script-module=model.pt + # Run tests C++-side and load the exported script module. + build/test_custom_ops ./model.pt + popd +} + + if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then test_python_all test_cpp_api + test_custom_script_ops else if [[ "${JOB_BASE_NAME}" == *-test1 ]]; then test_python_all elif [[ "${JOB_BASE_NAME}" == *-test2 ]]; then test_cpp_api + test_custom_script_ops fi fi diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 05bd71602b9783..471fd8fac1fc6e 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -90,6 +90,8 @@ test_python_all_except_nn() { test_aten() { # Test ATen + # The following test(s) of ATen have already been skipped by caffe2 in rocm environment: + # scalar_tensor_test, basic, native_test if ([[ "$BUILD_ENVIRONMENT" != *asan* ]] && [[ "$BUILD_ENVIRONMENT" != *rocm* ]]); then echo "Running ATen tests with pytorch lib" TORCH_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/torch/lib @@ -97,7 +99,7 @@ test_aten() { # put the dynamic libraries somewhere were the dynamic linker can find them. # This is a bit of a hack. if [[ "$BUILD_ENVIRONMENT" == *ppc64le* ]]; then - SUDO=sudo + SUDO=sudo fi ${SUDO} ln -s "$TORCH_LIB_PATH"/libcaffe2* build/bin @@ -140,12 +142,28 @@ test_libtorch() { fi } +test_custom_script_ops() { + if [[ "$BUILD_TEST_LIBTORCH" == "1" ]]; then + echo "Testing custom script operators" + CUSTOM_OP_BUILD="$PWD/../custom-op-build" + pushd test/custom_operator + cp -r "$CUSTOM_OP_BUILD" build + # Run tests Python-side and export a script module. + python test_custom_ops.py -v + python model.py --export-script-module=model.pt + # Run tests C++-side and load the exported script module. 
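Both the Linux build script and the new test_custom_script_ops function above build the custom-operator test project out of tree against the torch package that was just installed, by pointing CMAKE_PREFIX_PATH at the CMake config files shipped inside site-packages/torch (typically consumed through a find_package call in the project's CMakeLists). A condensed sketch of that pattern; the source and build directory names are illustrative:

# Build an out-of-tree CMake project against the torch installed in the
# current Python environment.
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
SRC_DIR="$PWD/test/custom_operator"     # any project with its own CMakeLists.txt
BUILD_DIR="$PWD/../custom-op-build"

mkdir -p "$BUILD_DIR"
pushd "$BUILD_DIR"
# CMAKE_PREFIX_PATH lets the project's find_package call locate the CMake
# config files that ship inside the installed torch package.
CMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" cmake "$SRC_DIR"
make VERBOSE=1
popd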
+ build/test_custom_ops ./model.pt + popd + fi +} + if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then test_python_nn test_python_all_except_nn test_aten test_torchvision test_libtorch + test_custom_script_ops else if [[ "${JOB_BASE_NAME}" == *-test1 ]]; then test_python_nn @@ -154,5 +172,6 @@ else test_aten test_torchvision test_libtorch + test_custom_script_ops fi fi diff --git a/.travis.yml b/.travis.yml index be45e69f67cb2f..77d430ee8917a4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,4 +28,4 @@ matrix: script: mypy @mypy-files.txt - env: CPP_DOC_CHECK install: sudo apt-get install -y doxygen - script: cd docs/cpp && ./check-doxygen.sh + script: cd docs/cpp/source && ./check-doxygen.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index b7f56f96e87f3a..827121b1fc5931 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,7 @@ option(BUILD_BINARY "Build C++ binaries" OFF) option(BUILD_DOCS "Build Caffe2 documentation" OFF) option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" ON) option(BUILD_PYTHON "Build Python binaries" ON) +option(BUILD_CAFFE2_OPS "Build Caffe2 operators" ON) option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON) cmake_dependent_option( CAFFE2_LINK_LOCAL_PROTOBUF "If set, build protobuf inside libcaffe2.so." ON @@ -115,7 +116,7 @@ option(USE_IDEEP "Use IDEEP interface in MKL BLAS" ON) option(USE_MKLML "Use MKLML interface in MKL BLAS" ON) option(USE_DISTRIBUTED "Use distributed" ON) cmake_dependent_option( - USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." OFF + USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." ON "USE_DISTRIBUTED" OFF) cmake_dependent_option( USE_GLOO "Use Gloo. Only available if USE_DISTRIBUTED is on." ON @@ -123,6 +124,7 @@ cmake_dependent_option( cmake_dependent_option( USE_GLOO_IBVERBS "Use Gloo IB verbs for distributed. Only available if USE_GLOO is on." OFF "USE_GLOO" OFF) +option(TORCH_USE_CEREAL "Build the C++ API with Cereal for serialization support" OFF) # Used when building Caffe2 through setup.py option(BUILDING_WITH_TORCH_LIBS "Tell cmake if Caffe2 is being built alongside torch libs" OFF) diff --git a/CODEOWNERS b/CODEOWNERS index 113be035c9b99f..5723d6ebe5c058 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -2,6 +2,7 @@ # Each line is a file pattern followed by one or more owners. /aten/ @apaszke @soumith @colesbury @gchanan @zdevito @ezyang +/aten/src/ATen/core/ /torch/ @apaszke @soumith @colesbury @gchanan @zdevito @ezyang /docs/source @apaszke @soumith @colesbury @gchanan @zdevito @ezyang @ssnl @zou3519 /docs/cpp @goldsborough @ebetica @apaszke @soumith @colesbury @gchanan @zdevito @ezyang diff --git a/README.md b/README.md index b909001edc6f14..b23bc60aa19de6 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ We are in an early-release beta. Expect some adventures and rough edges. - [Binaries](#binaries) - [From Source](#from-source) - [Docker Image](#docker-image) + - [Building the Documentation](#building-the-documentation) - [Previous Versions](#previous-versions) - [Getting Started](#getting-started) - [Communication](#communication) @@ -200,9 +201,8 @@ set DISTUTILS_USE_SDK=1 REM The following two lines are needed for Python 2.7, but the support for it is very experimental. set MSSdk=1 set FORCE_PY27_BUILD=1 -REM As for CUDA 8, VS2015 Update 3 is also required to build PyTorch. Use the following two lines. 
-set "PREBUILD_COMMAND=%VS140COMNTOOLS%\..\..\VC\vcvarsall.bat" -set PREBUILD_COMMAND_ARGS=x64 +REM As for CUDA 8, VS2015 Update 3 is also required to build PyTorch. Use the following line. +set "CUDA_HOST_COMPILER=%VS140COMNTOOLS%\..\..\VC\bin\amd64\cl.exe" call "%VS150COMNTOOLS%\vcvarsall.bat" x64 -vcvars_ver=14.11 python setup.py install @@ -224,6 +224,18 @@ Please note that PyTorch uses shared memory to share data between processes, so for multithreaded data loaders) the default shared memory segment size that container runs with is not enough, and you should increase shared memory size either with `--ipc=host` or `--shm-size` command line options to `nvidia-docker run`. +### Building the Documentation + +To build documentation in various formats, you will need Sphinx and the +readthedocs theme. + +``` +cd docs/ +pip install -r requirements.txt +``` +You can then build the documentation by running ``make `` from the +``docs/`` folder. Run ``make`` to get a list of all available output formats. + ### Previous Versions Installation instructions and binaries for previous PyTorch versions may be found diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index ee025265a982e7..0f0019d57b11e9 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -1,19 +1,5 @@ -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - cmake_minimum_required(VERSION 3.0 FATAL_ERROR) - project(ATen CXX C) - include(CMakeDependentOption) - option(USE_CUDA "Use CUDA" ON) - option(USE_ROCM "Use ROCm" OFF) - option(USE_CUDNN "Use cuDNN" ON) - option(USE_MKLDNN "Use MKLDNN" ON) - cmake_dependent_option( - USE_CUDNN "Use cuDNN" ON - "USE_CUDA" OFF) - option(ATEN_NO_TEST "Do not build ATen test binaries" OFF) -else() - if (BUILD_ATEN_MOBILE) - return() - endif() +if (BUILD_ATEN_MOBILE) + return() endif() # Find modules @@ -42,32 +28,6 @@ SET(ATEN_INSTALL_BIN_SUBDIR "bin" CACHE PATH "ATen install binary subdirectory") SET(ATEN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "ATen install library subdirectory") SET(ATEN_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "ATen install include subdirectory") -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - # ---[ Build variables set within the cmake tree - include(../cmake/BuildVariables.cmake) - set(CAFFE2_WHITELIST "" CACHE STRING "A whitelist file of files that one should build.") - - # ---[ Misc checks to cope with various compiler modes - include(../cmake/MiscCheck.cmake) - - # External projects - include(ExternalProject) - - # ---[ Utils - # TODO: merge the following 3 files into cmake/public/utils.cmake. 
- include(../cmake/Utils.cmake) - include(../cmake/public/utils.cmake) - - # ---[ Dependencies - include(../cmake/Dependencies.cmake) - list(APPEND ATen_CPU_INCLUDE ${Caffe2_CPU_INCLUDE}) - list(APPEND ATen_CUDA_INCLUDE ${Caffe2_GPU_INCLUDE}) - list(APPEND ATen_CPU_DEPENDENCY_LIBS ${Caffe2_DEPENDENCY_LIBS}) - list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${Caffe2_CUDA_DEPENDENCY_LIBS}) - list(APPEND ATen_PUBLIC_CUDA_DEPENDENCY_LIBS - ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}) -endif() - if(USE_CUDA) list(APPEND ATen_CUDA_INCLUDE ${CUDA_INCLUDE_DIRS}) endif() @@ -132,16 +92,14 @@ list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/src/ATen) add_subdirectory(src/ATen) -if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - # Pass source, includes, and libs to parent - set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) - set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE) - set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) - set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) - set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE) - set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE) - set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) - set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) - set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) - set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) -endif() +# Pass source, includes, and libs to parent +set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) +set(ATen_CUDA_SRCS ${ATen_CUDA_SRCS} PARENT_SCOPE) +set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) +set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) +set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE) +set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE) +set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) +set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) +set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) +set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/ATen.h b/aten/src/ATen/ATen.h index cf074730bf072b..29812852e24fdb 100644 --- a/aten/src/ATen/ATen.h +++ b/aten/src/ATen/ATen.h @@ -19,7 +19,7 @@ #include "ATen/core/Storage.h" #include "ATen/Tensor.h" #include "ATen/TensorGeometry.h" -#include "ATen/TensorMethods.h" +#include "ATen/core/TensorMethods.h" #include "ATen/TensorOperators.h" #include "ATen/core/TensorOptions.h" #include "ATen/Type.h" diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index f6d296dfe79e45..994756fa18c995 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -1,11 +1,6 @@ cmake_minimum_required(VERSION 3.0 FATAL_ERROR) SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH}) -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - # ---[ Generate and install header and cpp files - include(../../../cmake/Codegen.cmake) -endif() - IF(NOT MSVC) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-ignored-qualifiers") SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-ignored-qualifiers") @@ -352,34 +347,6 @@ if(NOT AT_LINK_STYLE STREQUAL "INTERFACE") endif() endif() -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - # Eventually replace this use of LOCATION with use of - # $, but generators only work in some cases - cmake_policy(SET CMP0026 OLD) - get_target_property(ATEN_CPU_OUTPUT_NAME ATen_cpu LOCATION) - get_filename_component(ATEN_CPU_OUTPUT_NAME ${ATEN_CPU_OUTPUT_NAME} NAME) - set(ATEN_LIBRARIES - 
"${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_LIB_DIR}/${ATEN_CPU_OUTPUT_NAME}") - if(USE_CUDA OR USE_ROCM) - get_target_property(ATEN_CUDA_OUTPUT_NAME ATen_cuda LOCATION) - get_filename_component(ATEN_CUDA_OUTPUT_NAME ${ATEN_CUDA_OUTPUT_NAME} NAME) - list(APPEND ATEN_LIBRARIES - "${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_LIB_DIR}/${ATEN_CUDA_OUTPUT_NAME}") - endif() - - install(TARGETS ATen_cpu - RUNTIME DESTINATION "${AT_INSTALL_BIN_DIR}" - LIBRARY DESTINATION "${AT_INSTALL_LIB_DIR}" - ARCHIVE DESTINATION "${AT_INSTALL_LIB_DIR}") - - if(USE_CUDA OR USE_ROCM) - install(TARGETS ATen_cuda - RUNTIME DESTINATION "${AT_INSTALL_BIN_DIR}" - LIBRARY DESTINATION "${AT_INSTALL_LIB_DIR}" - ARCHIVE DESTINATION "${AT_INSTALL_LIB_DIR}") - endif() -endif() - SET(ATEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_INCLUDE_DIR}") CONFIGURE_FILE(ATenConfig.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake") INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" @@ -404,38 +371,6 @@ else() add_subdirectory(test) endif() -if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - foreach(test_src ${ATen_CPU_TEST_SRCS}) - get_filename_component(test_name ${test_src} NAME_WE) - add_executable(${test_name} "${test_src}") - target_include_directories( - ${test_name} PRIVATE $) - target_include_directories(${test_name} PRIVATE ${ATen_CPU_INCLUDE}) - target_include_directories(${test_name} SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE}) - target_link_libraries(${test_name} ATen_cpu) - add_test(NAME ${test_name} COMMAND $) - install(TARGETS ${test_name} DESTINATION test) - endforeach() - - if(USE_CUDA OR USE_ROCM) - foreach(test_src ${ATen_CUDA_TEST_SRCS}) - get_filename_component(test_name ${test_src} NAME_WE) - torch_cuda_based_add_executable(${test_name} "${test_src}") - target_include_directories( - ${test_name} PRIVATE $) - target_include_directories(${test_name} PRIVATE ${ATen_CPU_INCLUDE}) - target_include_directories(${test_name} SYSTEM PRIVATE ${ATen_THIRD_PARTY_INCLUDE}) - target_link_libraries(${test_name} -Wl,--no-as-needed ATen_cpu ATen_cuda) - add_test(NAME ${test_name} COMMAND $) - install(TARGETS ${test_name} DESTINATION test) - endforeach() - endif() - - # Make sure these don't get built by parent - set(ATen_CPU_TEST_SRCS) - set(ATen_CUDA_TEST_SRCS) -endif() - # Pass source, includes, and libs to parent set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 40cb364e91fd0c..c73d2efd8ea813 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -12,6 +12,7 @@ #include "ATen/CPUGenerator.h" #include "ATen/RegisterCPU.h" +#include "ATen/Tensor.h" #include "TH/TH.h" // for USE_LAPACK @@ -107,15 +108,19 @@ bool Context::setFlushDenormal(bool on) { #endif } -Type& getType(TensorOptions options) { +TypeExtendedInterface& getType(TensorOptions options) { return globalContext().getType( options.backend(), options.dtype(), options.is_variable()); } -Type& getType(const TensorImpl* impl) { +TypeExtendedInterface& getType(const TensorImpl* impl) { Backend backend = tensorTypeIdToBackend(impl->type_id()); return globalContext().getType( - backend, impl->scalar_type(), impl->is_variable()); + backend, dataTypeToScalarType(impl->dtype().id()), impl->is_variable()); +} + +TypeExtendedInterface& getType(const Tensor& t) { + return getType(t.unsafeGetTensorImpl()); } Allocator* getCPUAllocator() { diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 
7b3634dd83086f..4e147cffabbe86 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -5,6 +5,7 @@ #include "ATen/CUDAStream.h" #include "ATen/core/Generator.h" #include "ATen/Type.h" +#include "ATen/TypeExtendedInterface.h" #include "ATen/Utils.h" #include "ATen/core/Error.h" #include "ATen/detail/CUDAHooksInterface.h" @@ -21,23 +22,25 @@ namespace at { +struct Tensor; + class AT_API Context { public: Context(); - Type* getNonVariableTypeRaw(Backend p, ScalarType s) { - return globalLegacyTypeDispatch().getNonVariableTypeRaw(p, s); + TypeExtendedInterface* getNonVariableTypeRaw(Backend p, ScalarType s) { + return static_cast(globalLegacyTypeDispatch().getNonVariableTypeRaw(p, s)); } - Type * getNonVariableTypeOpt(Backend p, ScalarType s) { - return globalLegacyTypeDispatch().getNonVariableTypeOpt(p, s); + TypeExtendedInterface * getNonVariableTypeOpt(Backend p, ScalarType s) { + return static_cast(globalLegacyTypeDispatch().getNonVariableTypeOpt(p, s)); } - Type & getNonVariableType(Backend p, ScalarType s) { - return globalLegacyTypeDispatch().getNonVariableType(p, s); + TypeExtendedInterface & getNonVariableType(Backend p, ScalarType s) { + return static_cast(globalLegacyTypeDispatch().getNonVariableType(p, s)); } - Type & getVariableType(Backend p, ScalarType s) { - return globalLegacyTypeDispatch().getVariableType(p, s); + TypeExtendedInterface & getVariableType(Backend p, ScalarType s) { + return static_cast(globalLegacyTypeDispatch().getVariableType(p, s)); } - Type & getType(Backend p, ScalarType s, bool is_variable) { - return globalLegacyTypeDispatch().getType(p, s, is_variable); + TypeExtendedInterface & getType(Backend p, ScalarType s, bool is_variable) { + return static_cast(globalLegacyTypeDispatch().getType(p, s, is_variable)); } // The passed in Type must be delete'able // TODO: Just make it take a unique_ptr @@ -142,24 +145,25 @@ static inline void init() { } } -static inline Type& getNonVariableType(Backend p, ScalarType s) { +static inline TypeExtendedInterface& getNonVariableType(Backend p, ScalarType s) { return globalContext().getNonVariableType(p, s); } -static inline Type& getNonVariableType(DeviceType p, ScalarType s) { +static inline TypeExtendedInterface& getNonVariableType(DeviceType p, ScalarType s) { return globalContext().getNonVariableType(deviceTypeToBackend(p), s); } -AT_API Type& getType(TensorOptions options); -AT_API Type& getType(const TensorImpl*); +AT_API TypeExtendedInterface& getType(TensorOptions options); +AT_API TypeExtendedInterface& getType(const TensorImpl*); +AT_API TypeExtendedInterface& getType(const Tensor&); AT_API Allocator* getCPUAllocator(); -static inline Type& CPU(ScalarType s) { +static inline TypeExtendedInterface& CPU(ScalarType s) { return getNonVariableType(Backend::CPU, s); } -static inline Type& CUDA(ScalarType s) { +static inline TypeExtendedInterface& CUDA(ScalarType s) { return getNonVariableType(Backend::CUDA, s); } diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap index 9d67537ccdedd8..d45815c5b600c9 100644 --- a/aten/src/ATen/Declarations.cwrap +++ b/aten/src/ATen/Declarations.cwrap @@ -302,8 +302,7 @@ output: True - accreal start - accreal end - - arg: accreal step - default: 1 + - accreal step ]] [[ name: _arange @@ -320,8 +319,7 @@ output: True - accreal start - accreal end - - arg: accreal step - default: 1 + - accreal step - cname: arange arguments: - arg: THTensor* result @@ -1956,8 +1954,7 @@ output: True - real start - real end - - arg: long steps - default: 100 + - 
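In the Context.h hunk above, the getType/getNonVariableType accessors still fetch Type objects from the legacy dispatch registry but now static_cast the returned reference to TypeExtendedInterface&, which is sound only because every concrete Type the registry hands out is known to implement that extended interface. A stripped-down illustration of the pattern with made-up names (Base, Extended, and Registry are not ATen types):

#include <cassert>
#include <cstddef>
#include <memory>
#include <vector>

struct Base { virtual ~Base() = default; };
// The richer interface that every concrete implementation actually derives from.
struct Extended : Base { virtual int extra() const { return 42; } };

struct Registry {
  // The registry only knows about Base, but is only ever populated
  // with Extended instances.
  std::vector<std::unique_ptr<Base>> slots;
  Base& get(std::size_t i) { return *slots.at(i); }
};

// The accessor narrows the static type for callers, mirroring the
// static_cast to TypeExtendedInterface& in Context.h.
Extended& getExtended(Registry& r, std::size_t i) {
  return static_cast<Extended&>(r.get(i));
}

int main() {
  Registry r;
  r.slots.push_back(std::make_unique<Extended>());
  assert(getExtended(r, 0).extra() == 42);
}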
long steps ]] [[ name: _logspace @@ -1976,8 +1973,7 @@ output: True - real start - real end - - arg: long steps - default: 100 + - long steps ]] [[ name: histc @@ -2471,11 +2467,12 @@ - THTensor* mat2 ]] [[ - name: bmm + name: _th_bmm cname: baddbmm variants: - - method - function + backends: + - CUDA return: argument 0 arguments: - arg: THTensor* result @@ -2525,10 +2522,12 @@ - THTensor* batch2 ]] [[ - name: baddbmm + name: _th_baddbmm + cname: baddbmm variants: - - method - function + backends: + - CUDA return: argument 0 arguments: - arg: THTensor* result @@ -2544,22 +2543,6 @@ - THTensor* batch1 - THTensor* batch2 ]] -[[ - name: baddbmm_ - cname: baddbmm - return: argument 0 - arguments: - - THTensor* self - - arg: real beta - default: AS_REAL(1) - kwarg_only: True - - THTensor* self - - arg: real alpha - default: AS_REAL(1) - kwarg_only: True - - THTensor* batch1 - - THTensor* batch2 -]] [[ name: addcmul variants: diff --git a/aten/src/ATen/DeviceGuard.h b/aten/src/ATen/DeviceGuard.h index b51d80d22d350f..4da336aef5b7cd 100644 --- a/aten/src/ATen/DeviceGuard.h +++ b/aten/src/ATen/DeviceGuard.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -28,6 +29,12 @@ struct DeviceGuard { } } + explicit DeviceGuard(optional device_opt) { + if (device_opt.has_value() && device_opt.value().is_cuda()) { + set_index(device_opt.value().index()); + } + } + /// Calls `set_index` with the given index. explicit DeviceGuard(int32_t index) { set_index(index); diff --git a/aten/src/ATen/DimVector.h b/aten/src/ATen/DimVector.h index 9bbf37b5a9f4d4..cb652fffcb1481 100644 --- a/aten/src/ATen/DimVector.h +++ b/aten/src/ATen/DimVector.h @@ -1,11 +1,2 @@ #pragma once - -#include -#include - -namespace at { - -/// A container for sizes or strides -using DimVector = SmallVector; - -} +#include diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index c598901b2b943f..64f181d4dccb3c 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -79,3 +79,52 @@ AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ } \ }() + +#define AT_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...) \ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexFloat, std::complex, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexDouble, std::complex, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() + + +#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX(TYPE, NAME, ...) \ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexFloat, std::complex, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexDouble, std::complex, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() + +#define AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX(TYPE, NAME, ...) 
\ + [&] { \ + const at::Type& the_type = TYPE; \ + switch (the_type.scalarType()) { \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexFloat, std::complex, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexDouble, std::complex, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \ + } \ + }() diff --git a/aten/src/ATen/Formatting.h b/aten/src/ATen/Formatting.h index 6a8b502cc978a8..392e2a27b0130c 100644 --- a/aten/src/ATen/Formatting.h +++ b/aten/src/ATen/Formatting.h @@ -1,24 +1 @@ -#pragma once - -#include -#include "ATen/Type.h" -#include "ATen/core/Scalar.h" - -namespace at { - -AT_API std::ostream& operator<<(std::ostream & out, IntList list); -AT_API std::ostream& operator<<(std::ostream & out, Backend b); -AT_API std::ostream& operator<<(std::ostream & out, const Type & t); -AT_API std::ostream& print(std::ostream& stream, const Tensor & tensor, int64_t linesize); -static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) { - return print(out,t,80); -} -static inline void print(const Tensor & t, int64_t linesize=80) { - print(std::cout,t,linesize); -} - -static inline std::ostream& operator<<(std::ostream & out, Scalar s) { - return out << (s.isFloatingPoint() ? s.toDouble() : s.toLong()); -} - -} +#include diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h index 6aadd62eb1d3fd..a4c8b50abe8263 100644 --- a/aten/src/ATen/Parallel.h +++ b/aten/src/ATen/Parallel.h @@ -27,7 +27,7 @@ inline void parallel_for( const int64_t grain_size, const F& f) { #ifdef _OPENMP -#pragma omp parallel if ((end - begin) >= grain_size) +#pragma omp parallel if (!omp_in_parallel() && ((end - begin) >= grain_size)) { int64_t num_threads = omp_get_num_threads(); int64_t tid = omp_get_thread_num(); diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index 3f13d59b4467e5..66b71dd7b8a650 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -28,13 +28,13 @@ namespace { // // This means that we allocate a [1,0] size indices tensor and a [0] size // values tensor for such an empty tensor. 
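The AT_DISPATCH_*_COMPLEX macros added to Dispatch.h above follow the usual ATen dispatch shape: switch on the runtime scalar type and invoke the caller's lambda with the matching C++ type bound to scalar_t, so one templated kernel body covers every dtype. A self-contained miniature of that pattern over a hypothetical two-value dtype enum (not ATen's ScalarType):

#include <complex>
#include <cstdio>
#include <stdexcept>

enum class Dtype { Float, ComplexFloat };   // stand-in for at::ScalarType

// Minimal analogue of an AT_DISPATCH macro: pick scalar_t at runtime,
// then run the caller-supplied lambda with that alias in scope.
#define DISPATCH_FLOAT_AND_COMPLEX(DTYPE, NAME, ...)              \
  [&] {                                                           \
    switch (DTYPE) {                                              \
      case Dtype::Float: {                                        \
        using scalar_t = float;                                   \
        return __VA_ARGS__();                                     \
      }                                                           \
      case Dtype::ComplexFloat: {                                 \
        using scalar_t = std::complex<float>;                     \
        return __VA_ARGS__();                                     \
      }                                                           \
      default:                                                    \
        throw std::runtime_error(NAME ": unsupported dtype");     \
    }                                                             \
  }()

int main() {
  Dtype dt = Dtype::ComplexFloat;
  DISPATCH_FLOAT_AND_COMPLEX(dt, "fill_demo", [&] {
    scalar_t value{};            // scalar_t is std::complex<float> here
    std::printf("sizeof(scalar_t) = %zu\n", sizeof(value));
  });
}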
-SparseTensorImpl::SparseTensorImpl(at::TensorTypeId type_id, at::ScalarType scalar_type) - : TensorImpl(type_id, scalar_type, nullptr, false) +SparseTensorImpl::SparseTensorImpl(at::TensorTypeId type_id, const caffe2::TypeMeta& data_type) + : TensorImpl(type_id, data_type, nullptr, false) , size_{0} , sparseDims_(1) , denseDims_(0) , indices_(globalContext().getNonVariableTypeOpt(sparseTensorIdToDenseBackend(type_id), ScalarType::Long)->tensor({1, 0})) - , values_(globalContext().getNonVariableTypeOpt(sparseTensorIdToDenseBackend(type_id), scalar_type)->tensor()) {} + , values_(globalContext().getNonVariableTypeOpt(sparseTensorIdToDenseBackend(type_id), dataTypeToScalarType(data_type.id()))->tensor()) {} IntList SparseTensorImpl::sizes() const { return size_; diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 835b45c2a541ee..42b670bea08541 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -1,7 +1,7 @@ #pragma once #include "ATen/Tensor.h" -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" #include "ATen/core/Error.h" namespace at { @@ -36,7 +36,7 @@ struct AT_API SparseTensorImpl : public TensorImpl { public: // Public for now... - explicit SparseTensorImpl(at::TensorTypeId, at::ScalarType); + explicit SparseTensorImpl(at::TensorTypeId, const caffe2::TypeMeta&); int64_t nnz() const { return values_.size(0); } int64_t sparseDims() const { return sparseDims_; } diff --git a/aten/src/ATen/Tensor.h b/aten/src/ATen/Tensor.h new file mode 100644 index 00000000000000..cef05f5341cb31 --- /dev/null +++ b/aten/src/ATen/Tensor.h @@ -0,0 +1,2 @@ +#pragma once +#include diff --git a/aten/src/ATen/TensorGeometry.cpp b/aten/src/ATen/TensorGeometry.cpp index ca3fcd961feda0..b11c7bb159900b 100644 --- a/aten/src/ATen/TensorGeometry.cpp +++ b/aten/src/ATen/TensorGeometry.cpp @@ -1,4 +1,5 @@ #include +#include #include @@ -8,15 +9,7 @@ bool TensorGeometry::is_contiguous() const { if (numel_ == 0) { return true; } - int64_t dim = sizes_.size(); - int64_t expected_stride = 1; - for (int64_t i = dim - 1; i >= 0; i--) { - if (sizes_[i] != 1 && strides_[i] != expected_stride) { - return false; - } - expected_stride *= sizes_[i]; - } - return true; + return at::geometry_is_contiguous(sizes_, strides_); } Tensor TensorGeometry::zeros_with_stride(const Type& type) const { diff --git a/aten/src/ATen/TensorImpl.h b/aten/src/ATen/TensorImpl.h index e0a649a49b6ccd..34ece0fc0d03fb 100644 --- a/aten/src/ATen/TensorImpl.h +++ b/aten/src/ATen/TensorImpl.h @@ -1,204 +1,2 @@ #pragma once - -#include -#include - -#include "ATen/core/Storage.h" -#include "ATen/core/optional.h" -#include "ATen/core/TensorTypeId.h" -#include "ATen/core/TensorTypeIdRegistration.h" -#include "ATen/core/LegacyTypeDispatch.h" -#include "ATen/core/Backend.h" - -struct THTensor; - -namespace at { -class Scalar; -struct Type; -struct Storage; -struct Tensor; -} // namespace at - -namespace at { -struct AT_API TensorImpl : public c10::intrusive_ptr_target { - TensorImpl() = delete; - TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable); - TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable); - - virtual void release_resources() override; - - Type & type() const { - // NB: It's valid to use getTypeRaw here, because the TensorImpl - // could not have been created without initializing the Type first. - // TODO: This is not actually true via the Caffe2 codepath! Make - // it so. 
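The Parallel.h change a little above adds !omp_in_parallel() to the `#pragma omp parallel if (...)` clause, so a parallel_for reached from code that is already inside a parallel region falls back to running serially in the calling thread instead of oversubscribing the machine. A minimal OpenMP illustration of that guard, independent of ATen (compile with -fopenmp):

#include <cstdio>
#include <omp.h>

void inner(int n) {
  // Without the omp_in_parallel() check, each outer thread could spawn its
  // own nested team (depending on nested-parallelism settings); with it,
  // nested calls simply run in the caller's thread.
  #pragma omp parallel if (!omp_in_parallel() && n >= 1000)
  {
    if (omp_get_thread_num() == 0)
      std::printf("inner: %d thread(s), nested=%d\n",
                  omp_get_num_threads(), omp_in_parallel());
  }
}

int main() {
  inner(5000);                 // top-level call: gets a parallel team
  #pragma omp parallel num_threads(2)
  { inner(5000); }             // nested call: runs single-threaded per caller
}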
- return *globalLegacyTypeDispatch().getTypeRaw(tensorTypeIdToBackend(type_id()), scalar_type(), is_variable()); - } - - TensorTypeId type_id() const { return type_id_; } - virtual IntList sizes() const; - virtual IntList strides() const; - virtual int64_t dim() const; - virtual const Storage& storage() const; - friend struct Type; - - virtual int64_t numel() const { -#ifdef DEBUG - AT_ASSERT(compute_numel() == numel_); -#endif - return numel_; - } - - virtual bool is_contiguous() const { -#ifdef DEBUG - AT_ASSERT(compute_contiguous() == is_contiguous_); -#endif - return is_contiguous_; - } - - // this is called by the generated wrapper code when there are conditions - // when this output tensor should be zero dimensional. e.g. when all inputs - // to a function 'add' were zero dimensional, then condition_when_zero_dim == true. - // we also prevent this from getting marked as a zero dim tensor if it is not - // the right shape afterall. - virtual TensorImpl* maybe_zero_dim(bool condition_when_zero_dim); - - // True if a tensor was auto-wrapped from a C++ or Python number. - // Wrapped numbers do not participate in the result type computation for - // mixed-type operations if there are any Tensors that are not wrapped - // numbers. Otherwise, they behave like their non-wrapped equivalents. - // See [Result type computation] in TensorIterator.h. - bool is_wrapped_number() const { - return is_wrapped_number_; - } - void set_wrapped_number(bool value) { - AT_ASSERT(dim() == 0); - is_wrapped_number_ = value; - } - - // ~~~~~ Autograd API ~~~~~ - // Some methods below are defined in TensorImpl.cpp because Tensor is an - // incomplete type. - - virtual void set_requires_grad(bool requires_grad) { - AT_ERROR("set_requires_grad is not implemented for Tensor"); - } - virtual bool requires_grad() const { - AT_ERROR("requires_grad is not implemented for Tensor"); - } - - virtual Tensor& grad(); - virtual const Tensor& grad() const; - - // TODO: make these protected - // Note: storage->size() may be greater than the recorded size - // of a tensor - at::Storage storage_; - - template - inline T * data() const { - return storage_.data() + storage_offset_; - } - - template - inline T * unsafe_data() const { - return storage_.unsafe_data() + storage_offset_; - } - - inline at::ScalarType scalar_type() const { - return scalar_type_; - } - - virtual int64_t storage_offset() const { - return storage_offset_; - } - - // represents that numel() == 0. 
- inline bool is_empty() const { - return numel() == 0; - } - - virtual void resize_dim(int64_t ndim) { - // NB: This is *truly* a resize; calling code (e.g., squeeze) - // assumes that old values are preserved - sizes_.resize(ndim); - strides_.resize(ndim); - refresh_numel(); - refresh_contiguous(); - } - - virtual void set_size(int64_t dim, int64_t new_size) { - sizes_[dim] = new_size; - refresh_numel(); - refresh_contiguous(); - } - - virtual void set_stride(int64_t dim, int64_t new_stride) { - strides_[dim] = new_stride; - refresh_numel(); - refresh_contiguous(); - } - - virtual void set_storage_offset(int64_t storage_offset) { - storage_offset_ = storage_offset; - refresh_numel(); - refresh_contiguous(); - } - - // WARNING: This function does not check if the requested - // sizes/strides are in bounds for the storage that is allocated; - // this is the responsibility of the caller - void set_sizes_and_strides(at::IntList new_size, at::IntList new_stride) { - AT_CHECK( - new_size.size() == new_stride.size(), - "dimensionality of sizes (", - new_size.size(), - ") must match dimensionality of strides (", - new_stride.size(), - ")"); - sizes_ = new_size.vec(); - strides_ = new_stride.vec(); - refresh_numel(); - refresh_contiguous(); - } - - virtual int64_t size(int64_t d) const; - virtual int64_t stride(int64_t d) const; - - bool is_variable() const { return is_variable_; }; - - private: - int64_t storage_offset_; - std::vector sizes_; - std::vector strides_; - - bool is_contiguous_; - int64_t numel_; - - int64_t compute_numel() const { - int64_t n = 1; - for (auto s : sizes()) { - n *= s; - } - return n; - } - bool compute_contiguous() const; - - protected: - void refresh_numel() { - numel_ = compute_numel(); - } - void refresh_contiguous() { - is_contiguous_ = compute_contiguous(); - } - TensorTypeId type_id_; - // INVARIANT: When storage is non-null, this scalar type must - // agree with the scalar type in storage - ScalarType scalar_type_; - bool is_variable_ = false; - bool is_wrapped_number_ = false; - - private: - TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable); -}; -} // namespace at +#include diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 454ab9e91fd29e..4ec8e374c2e515 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -215,4 +215,24 @@ void * maybe_data_ptr(const Tensor& tensor) { void * maybe_data_ptr(const TensorArg& tensor) { return tensor->defined() ? (void *)tensor->data_ptr() : nullptr; } + +// See TensorUtils.h on why this is useful now that we cache is_contiguous. 
+bool geometry_is_contiguous(IntList sizes, IntList strides) { + int64_t dim = sizes.size(); + int64_t expected_stride = 1; + bool contig_if_nonempty = true; + for (int64_t i = dim - 1; i >= 0; i--) { + if (sizes[i] == 0) { + return true; + } + if (contig_if_nonempty) { + if (sizes[i] != 1 && strides[i] != expected_stride) { + contig_if_nonempty = false; + } + expected_stride *= sizes[i]; + } + } + return contig_if_nonempty; +} + } diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h index cc7453f77375f6..2443bde4b482cb 100644 --- a/aten/src/ATen/TensorUtils.h +++ b/aten/src/ATen/TensorUtils.h @@ -78,4 +78,11 @@ AT_API void checkBackend(CheckedFrom c, at::ArrayRef t, at::Backend back AT_API void * maybe_data_ptr(const Tensor& tensor); AT_API void * maybe_data_ptr(const TensorArg& tensor); +// Return if the tensor geometry represented by `sizes` and `strides` is contiguous +// Although we cache is_contiguous in tensor now, this is till useful because it +// allows checking if a particular geometry is contiguous without explicitly +// constructing a tensor, e.g., when you want to choose a kernel strategy based +// on whether a subgeometry is contiguous. +AT_API bool geometry_is_contiguous(IntList sizes, IntList strides); + } diff --git a/aten/src/ATen/Type.h b/aten/src/ATen/Type.h new file mode 100644 index 00000000000000..0c95f43e0482e9 --- /dev/null +++ b/aten/src/ATen/Type.h @@ -0,0 +1,2 @@ +#pragma once +#include diff --git a/aten/src/ATen/UndefinedTensor.cpp b/aten/src/ATen/UndefinedTensor.cpp deleted file mode 100644 index 956c70b9f178af..00000000000000 --- a/aten/src/ATen/UndefinedTensor.cpp +++ /dev/null @@ -1,40 +0,0 @@ -#include "ATen/UndefinedTensor.h" -#include "ATen/core/Error.h" - -namespace at { - -// should this use the globalContext? Can it get a context passed in somehow? 
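geometry_is_contiguous above answers "would a tensor with these sizes and strides be contiguous?" without constructing a tensor, and it deliberately treats any geometry containing a zero-sized dimension as contiguous no matter what strides are recorded for it. A standalone copy of that logic over plain vectors, with a few asserts showing the intended behaviour (IntList is replaced by std::vector here):

#include <cassert>
#include <cstdint>
#include <vector>

// Same logic as at::geometry_is_contiguous, written against plain vectors.
bool geometry_is_contiguous(const std::vector<int64_t>& sizes,
                            const std::vector<int64_t>& strides) {
  int64_t expected_stride = 1;
  bool contig_if_nonempty = true;
  for (int64_t i = static_cast<int64_t>(sizes.size()) - 1; i >= 0; i--) {
    if (sizes[i] == 0) return true;          // empty geometries count as contiguous
    if (contig_if_nonempty) {
      if (sizes[i] != 1 && strides[i] != expected_stride)
        contig_if_nonempty = false;
      expected_stride *= sizes[i];
    }
  }
  return contig_if_nonempty;
}

int main() {
  assert( geometry_is_contiguous({2, 3}, {3, 1}));        // row-major 2x3
  assert(!geometry_is_contiguous({2, 3}, {1, 2}));        // column-major view
  assert( geometry_is_contiguous({2, 0, 3}, {7, 9, 11})); // zero-sized dim wins
}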
-UndefinedTensor::UndefinedTensor() -: TensorImpl(UndefinedTensorId(), ScalarType::Undefined, nullptr, /* is variable */ false) { -} - -IntList UndefinedTensor::sizes() const { - AT_ERROR("sizes() called on undefined Tensor"); -} - -int64_t UndefinedTensor::size(int64_t d) const { - AT_ERROR("size(dim) called on an undefined Tensor"); -} - -int64_t UndefinedTensor::stride(int64_t d) const { - AT_ERROR("stride(dim) called on an undefined Tensor"); -} - -int64_t UndefinedTensor::dim() const { - AT_ERROR("dim() called on undefined Tensor"); -} - -const Storage& UndefinedTensor::storage() const { - AT_ERROR("storage() called on undefined Tensor"); -} - -int64_t UndefinedTensor::storage_offset() const { - AT_ERROR("storage_offset() called on an undefined Tensor"); -} - -IntList UndefinedTensor::strides() const { - AT_ERROR("strides() called on undefined Tensor"); -} -UndefinedTensor UndefinedTensor::_singleton; - -} diff --git a/aten/src/ATen/UndefinedType.cpp b/aten/src/ATen/UndefinedType.cpp index bea9baf61892f5..8e9722eae3be09 100644 --- a/aten/src/ATen/UndefinedType.cpp +++ b/aten/src/ATen/UndefinedType.cpp @@ -8,6 +8,9 @@ UndefinedType::UndefinedType() ScalarType UndefinedType::scalarType() const { return ScalarType::Undefined; } +caffe2::TypeMeta UndefinedType::typeMeta() const { + return scalarTypeToTypeMeta(scalarType()); +} Backend UndefinedType::backend() const { return Backend::Undefined; } diff --git a/aten/src/ATen/UndefinedType.h b/aten/src/ATen/UndefinedType.h index 594fb99e61dc0e..4ccd6101851a72 100644 --- a/aten/src/ATen/UndefinedType.h +++ b/aten/src/ATen/UndefinedType.h @@ -14,6 +14,7 @@ namespace at { struct UndefinedType final : public TypeDefault { explicit UndefinedType(); virtual ScalarType scalarType() const override; + virtual caffe2::TypeMeta typeMeta() const override; virtual Backend backend() const override; virtual Allocator* allocator() const override; virtual Device getDeviceFromPtr(void* data) const override; diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h index fff88e39aba053..c4473d1471ab7d 100644 --- a/aten/src/ATen/Utils.h +++ b/aten/src/ATen/Utils.h @@ -2,7 +2,7 @@ #include "ATen/core/ATenGeneral.h" #include "ATen/StorageImpl.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include #include "ATen/Formatting.h" @@ -44,12 +44,12 @@ static inline const Storage& checked_storage( name, "'"); } - if (expr.dtype() != data_type) { + if (expr.dtype().id() != data_type) { AT_ERROR( "Expected object of data type ", data_type, " but got data type ", - expr.dtype(), + expr.dtype().id(), " for argument #", pos, " '", diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h index 8e9db589c5267a..467a5664f6e05b 100644 --- a/aten/src/ATen/WrapDimUtils.h +++ b/aten/src/ATen/WrapDimUtils.h @@ -1,7 +1,7 @@ #pragma once #include "ATen/core/WrapDimMinimal.h" -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" namespace at { diff --git a/aten/src/ATen/WrapDimUtilsMulti.h b/aten/src/ATen/WrapDimUtilsMulti.h index f3d3a81a365c26..4d3df92fe0bc57 100644 --- a/aten/src/ATen/WrapDimUtilsMulti.h +++ b/aten/src/ATen/WrapDimUtilsMulti.h @@ -1,6 +1,6 @@ #pragma once -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" #include "ATen/WrapDimUtils.h" #include #include diff --git a/aten/src/ATen/copy_wrapper.py b/aten/src/ATen/copy_wrapper.py index e4651f8c846070..a746f5543901ae 100644 --- a/aten/src/ATen/copy_wrapper.py +++ b/aten/src/ATen/copy_wrapper.py @@ -238,7 +238,7 @@ def create(all_types, backend): 
top_env['copy_includes'].append( '#include "ATen/{}.h"'.format(the_type['Type'])) top_env['copy_includes'].append( - '#include "ATen/TensorImpl.h"') + '#include "ATen/core/TensorImpl.h"') # Code generation for the_type in all_types: diff --git a/aten/src/ATen/core/ATenCoreTest.cpp b/aten/src/ATen/core/ATenCoreTest.cpp index 5bb595a0bce5de..bb670b315f16c9 100644 --- a/aten/src/ATen/core/ATenCoreTest.cpp +++ b/aten/src/ATen/core/ATenCoreTest.cpp @@ -1,9 +1,11 @@ #include +#include namespace at { static int CoreTestGlobal = 0; int CoreTest() { + Tensor x; return CoreTestGlobal++; } diff --git a/aten/src/ATen/core/C++17.h b/aten/src/ATen/core/C++17.h index d8440ceea0c21a..ac5b3022ed5ff1 100644 --- a/aten/src/ATen/core/C++17.h +++ b/aten/src/ATen/core/C++17.h @@ -12,6 +12,53 @@ namespace c10 { namespace guts { + + +#ifdef __cpp_lib_transformation_trait_aliases +template using conditional_t = std::conditional_t; +template using enable_if_t = std::enable_if_t; +template using add_lvalue_reference_t = std::add_lvalue_reference_t; +template using remove_reference_t = std::remove_reference_t; +template using remove_cv_t = std::remove_cv_t; +template using result_of_t = std::result_of_t; +template using decay_t = std::decay_t; +template using remove_const_t = std::remove_const_t; +template using remove_pointer_t = std::remove_pointer_t; +#else +template using conditional_t = typename std::conditional::type; +template using enable_if_t = typename std::enable_if::type; +template using add_lvalue_reference_t = typename std::add_lvalue_reference::type; +template using remove_reference_t = typename std::remove_reference::type; +template using remove_cv_t = typename std::remove_cv::type; +template using result_of_t = typename std::result_of::type; +template using decay_t = typename std::decay::type; +template using remove_const_t = typename std::remove_const::type; +template using remove_pointer_t = typename std::remove_pointer::type; +#endif + + + + +// C++11 doesn't have constexpr std::move / std::forward. +// Implementation taken from libc++. +template +constexpr inline guts::remove_reference_t&& move(T&& t) noexcept { + return static_cast&&>(t); +} +template +constexpr inline T&& forward(guts::remove_reference_t& t) noexcept { + return static_cast(t); +} +template +constexpr inline T&& forward(guts::remove_reference_t&& t) noexcept { + static_assert(!std::is_lvalue_reference::value, + "can not forward an rvalue as an lvalue."); + return static_cast(t); +} + + + + #if __cplusplus >= 201402L || defined(__cpp_lib_make_unique) && __cpp_lib_make_unique >= 201304L || \ (defined(__ANDROID__) && __ANDROID__ && __cplusplus >= 201300L) || defined(_MSC_VER) && _MSC_VER >= 1900 @@ -23,7 +70,7 @@ namespace c10 { namespace guts { template typename std::enable_if::value, std::unique_ptr>::type make_unique(Args&&... args) { - return std::unique_ptr(new T(std::forward(args)...)); + return std::unique_ptr(new T(forward(args)...)); } // Allows 'make_unique(10)'. (N3690 s20.9.1.4 p3-4) template @@ -39,6 +86,7 @@ make_unique(Args&&...) 
= delete; #endif + #ifdef __cpp_lib_integer_sequence template using integer_sequence = std::integer_sequence; @@ -73,26 +121,6 @@ template using index_sequence_for = make_index_sequence using conditional_t = std::conditional_t; -template using enable_if_t = std::enable_if_t; -template using add_lvalue_reference_t = std::add_lvalue_reference_t; -template using remove_reference_t = std::remove_reference_t; -template using remove_cv_t = std::remove_cv_t; -template using result_of_t = std::result_of_t; -template using decay_t = std::decay_t; -template using remove_const_t = std::remove_const_t; -#else -template using conditional_t = typename std::conditional::type; -template using enable_if_t = typename std::enable_if::type; -template using add_lvalue_reference_t = typename std::add_lvalue_reference::type; -template using remove_reference_t = typename std::remove_reference::type; -template using remove_cv_t = typename std::remove_cv::type; -template using result_of_t = typename std::result_of::type; -template using decay_t = typename std::decay::type; -template using remove_const_t = typename std::remove_const::type; -#endif - #ifdef __cpp_lib_logical_traits @@ -153,7 +181,7 @@ template using void_t = typename make_void::type; template inline constexpr decltype(auto) apply(F&& f, Tuple&& t) { - return std::apply(std::forward(f), std::forward(t)); + return std::apply(forward(f), forward(t)); } #else @@ -162,19 +190,19 @@ inline constexpr decltype(auto) apply(F&& f, Tuple&& t) { // TODO This is an incomplete implementation of std::apply, not working for member functions. namespace detail { template -constexpr auto apply_impl(F&& f, Tuple&& t, guts::index_sequence) -> decltype(std::forward(f)(std::get(std::forward(t))...)) +constexpr auto apply_impl(F&& f, Tuple&& t, guts::index_sequence) -> decltype(forward(f)(std::get(forward(t))...)) { - return std::forward(f)(std::get(std::forward(t))...); + return forward(f)(std::get(forward(t))...); } } // namespace detail template constexpr auto apply(F&& f, Tuple&& t) -> decltype(detail::apply_impl( - std::forward(f), std::forward(t), + forward(f), forward(t), guts::make_index_sequence>::value>{})) { return detail::apply_impl( - std::forward(f), std::forward(t), + forward(f), forward(t), guts::make_index_sequence>::value>{}); } diff --git a/aten/src/ATen/core/DimVector.h b/aten/src/ATen/core/DimVector.h new file mode 100644 index 00000000000000..a98c841a94777b --- /dev/null +++ b/aten/src/ATen/core/DimVector.h @@ -0,0 +1,11 @@ +#pragma once + +#include +#include + +namespace at { + +/// A container for sizes or strides +using DimVector = SmallVector; + +} // namespace at diff --git a/aten/src/ATen/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp similarity index 99% rename from aten/src/ATen/Formatting.cpp rename to aten/src/ATen/core/Formatting.cpp index 390230316bd0dc..f13b0082d90d10 100644 --- a/aten/src/ATen/Formatting.cpp +++ b/aten/src/ATen/core/Formatting.cpp @@ -1,6 +1,4 @@ -#include "ATen/Formatting.h" - -#include +#include "ATen/core/Formatting.h" #include #include diff --git a/aten/src/ATen/core/Formatting.h b/aten/src/ATen/core/Formatting.h new file mode 100644 index 00000000000000..c6ac26b8a9e0e3 --- /dev/null +++ b/aten/src/ATen/core/Formatting.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace at { + +AT_API std::ostream& operator<<(std::ostream & out, IntList list); +AT_API std::ostream& operator<<(std::ostream & out, Backend b); +AT_API std::ostream& operator<<(std::ostream & out, const 
Type & t); +AT_API std::ostream& print(std::ostream& stream, const Tensor & tensor, int64_t linesize); +static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) { + return print(out,t,80); +} +static inline void print(const Tensor & t, int64_t linesize=80) { + print(std::cout,t,linesize); +} + +static inline std::ostream& operator<<(std::ostream & out, Scalar s) { + return out << (s.isFloatingPoint() ? s.toDouble() : s.toLong()); +} + +} diff --git a/aten/src/ATen/core/Half-inl.h b/aten/src/ATen/core/Half-inl.h index a1786d0bb9db6e..75ff2a2fe6937f 100644 --- a/aten/src/ATen/core/Half-inl.h +++ b/aten/src/ATen/core/Half-inl.h @@ -16,7 +16,7 @@ namespace at { /// Constructors -inline AT_HOSTDEVICE Half::Half(float value) { +inline AT_HOST_DEVICE Half::Half(float value) { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) x = __half_as_short(__float2half(value)); #else @@ -26,7 +26,7 @@ inline AT_HOSTDEVICE Half::Half(float value) { /// Implicit conversions -inline AT_HOSTDEVICE Half::operator float() const { +inline AT_HOST_DEVICE Half::operator float() const { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) return __half2float(*reinterpret_cast(&x)); #else @@ -35,150 +35,158 @@ inline AT_HOSTDEVICE Half::operator float() const { } #ifdef __CUDACC__ -inline AT_HOSTDEVICE Half::Half(const __half& value) { +inline AT_HOST_DEVICE Half::Half(const __half& value) { x = *reinterpret_cast(&value); } -inline AT_HOSTDEVICE Half::operator __half() const { +inline AT_HOST_DEVICE Half::operator __half() const { return *reinterpret_cast(&x); } #endif +// CUDA intrinsics + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) +inline __device__ Half __ldg(const Half* ptr) { + return __ldg(reinterpret_cast(ptr)); +} +#endif + /// Arithmetic -inline AT_HOSTDEVICE Half operator+(const Half& a, const Half& b) { +inline AT_HOST_DEVICE Half operator+(const Half& a, const Half& b) { return static_cast(a) + static_cast(b); } -inline AT_HOSTDEVICE Half operator-(const Half& a, const Half& b) { +inline AT_HOST_DEVICE Half operator-(const Half& a, const Half& b) { return static_cast(a) - static_cast(b); } -inline AT_HOSTDEVICE Half operator*(const Half& a, const Half& b) { +inline AT_HOST_DEVICE Half operator*(const Half& a, const Half& b) { return static_cast(a) * static_cast(b); } -inline AT_HOSTDEVICE Half operator/(const Half& a, const Half& b) { +inline AT_HOST_DEVICE Half operator/(const Half& a, const Half& b) { return static_cast(a) / static_cast(b); } -inline AT_HOSTDEVICE Half operator-(const Half& a) { +inline AT_HOST_DEVICE Half operator-(const Half& a) { return -static_cast(a); } -inline AT_HOSTDEVICE Half& operator+=(Half& a, const Half& b) { +inline AT_HOST_DEVICE Half& operator+=(Half& a, const Half& b) { a = a + b; return a; } -inline AT_HOSTDEVICE Half& operator-=(Half& a, const Half& b) { +inline AT_HOST_DEVICE Half& operator-=(Half& a, const Half& b) { a = a - b; return a; } -inline AT_HOSTDEVICE Half& operator*=(Half& a, const Half& b) { +inline AT_HOST_DEVICE Half& operator*=(Half& a, const Half& b) { a = a * b; return a; } -inline AT_HOSTDEVICE Half& operator/=(Half& a, const Half& b) { +inline AT_HOST_DEVICE Half& operator/=(Half& a, const Half& b) { a = a / b; return a; } /// Arithmetic with floats -inline AT_HOSTDEVICE float operator+(Half a, float b) { +inline AT_HOST_DEVICE float operator+(Half a, float b) { return static_cast(a) + b; } -inline AT_HOSTDEVICE float operator-(Half a, float b) { +inline AT_HOST_DEVICE float operator-(Half a, float 
b) { return static_cast(a) - b; } -inline AT_HOSTDEVICE float operator*(Half a, float b) { +inline AT_HOST_DEVICE float operator*(Half a, float b) { return static_cast(a) * b; } -inline AT_HOSTDEVICE float operator/(Half a, float b) { +inline AT_HOST_DEVICE float operator/(Half a, float b) { return static_cast(a) / b; } -inline AT_HOSTDEVICE float operator+(float a, Half b) { +inline AT_HOST_DEVICE float operator+(float a, Half b) { return a + static_cast(b); } -inline AT_HOSTDEVICE float operator-(float a, Half b) { +inline AT_HOST_DEVICE float operator-(float a, Half b) { return a - static_cast(b); } -inline AT_HOSTDEVICE float operator*(float a, Half b) { +inline AT_HOST_DEVICE float operator*(float a, Half b) { return a * static_cast(b); } -inline AT_HOSTDEVICE float operator/(float a, Half b) { +inline AT_HOST_DEVICE float operator/(float a, Half b) { return a / static_cast(b); } -inline AT_HOSTDEVICE float& operator+=(float& a, const Half& b) { +inline AT_HOST_DEVICE float& operator+=(float& a, const Half& b) { return a += static_cast(b); } -inline AT_HOSTDEVICE float& operator-=(float& a, const Half& b) { +inline AT_HOST_DEVICE float& operator-=(float& a, const Half& b) { return a -= static_cast(b); } -inline AT_HOSTDEVICE float& operator*=(float& a, const Half& b) { +inline AT_HOST_DEVICE float& operator*=(float& a, const Half& b) { return a *= static_cast(b); } -inline AT_HOSTDEVICE float& operator/=(float& a, const Half& b) { +inline AT_HOST_DEVICE float& operator/=(float& a, const Half& b) { return a /= static_cast(b); } /// Arithmetic with doubles -inline AT_HOSTDEVICE double operator+(Half a, double b) { +inline AT_HOST_DEVICE double operator+(Half a, double b) { return static_cast(a) + b; } -inline AT_HOSTDEVICE double operator-(Half a, double b) { +inline AT_HOST_DEVICE double operator-(Half a, double b) { return static_cast(a) - b; } -inline AT_HOSTDEVICE double operator*(Half a, double b) { +inline AT_HOST_DEVICE double operator*(Half a, double b) { return static_cast(a) * b; } -inline AT_HOSTDEVICE double operator/(Half a, double b) { +inline AT_HOST_DEVICE double operator/(Half a, double b) { return static_cast(a) / b; } -inline AT_HOSTDEVICE double operator+(double a, Half b) { +inline AT_HOST_DEVICE double operator+(double a, Half b) { return a + static_cast(b); } -inline AT_HOSTDEVICE double operator-(double a, Half b) { +inline AT_HOST_DEVICE double operator-(double a, Half b) { return a - static_cast(b); } -inline AT_HOSTDEVICE double operator*(double a, Half b) { +inline AT_HOST_DEVICE double operator*(double a, Half b) { return a * static_cast(b); } -inline AT_HOSTDEVICE double operator/(double a, Half b) { +inline AT_HOST_DEVICE double operator/(double a, Half b) { return a / static_cast(b); } /// Arithmetic with ints -inline AT_HOSTDEVICE Half operator+(Half a, int b) { +inline AT_HOST_DEVICE Half operator+(Half a, int b) { return a + static_cast(b); } -inline AT_HOSTDEVICE Half operator-(Half a, int b) { +inline AT_HOST_DEVICE Half operator-(Half a, int b) { return a - static_cast(b); } -inline AT_HOSTDEVICE Half operator*(Half a, int b) { +inline AT_HOST_DEVICE Half operator*(Half a, int b) { return a * static_cast(b); } -inline AT_HOSTDEVICE Half operator/(Half a, int b) { +inline AT_HOST_DEVICE Half operator/(Half a, int b) { return a / static_cast(b); } -inline AT_HOSTDEVICE Half operator+(int a, Half b) { +inline AT_HOST_DEVICE Half operator+(int a, Half b) { return static_cast(a) + b; } -inline AT_HOSTDEVICE Half operator-(int a, Half b) { +inline 
AT_HOST_DEVICE Half operator-(int a, Half b) { return static_cast(a) - b; } -inline AT_HOSTDEVICE Half operator*(int a, Half b) { +inline AT_HOST_DEVICE Half operator*(int a, Half b) { return static_cast(a) * b; } -inline AT_HOSTDEVICE Half operator/(int a, Half b) { +inline AT_HOST_DEVICE Half operator/(int a, Half b) { return static_cast(a) / b; } diff --git a/aten/src/ATen/core/Half.h b/aten/src/ATen/core/Half.h index c306fcd6b92b72..47a8e8e52d2adb 100644 --- a/aten/src/ATen/core/Half.h +++ b/aten/src/ATen/core/Half.h @@ -30,14 +30,6 @@ #include #endif -#ifndef AT_HOSTDEVICE -#ifdef __CUDACC__ -#define AT_HOSTDEVICE __host__ __device__ -#else -#define AT_HOSTDEVICE -#endif -#endif - namespace at { namespace detail { @@ -55,18 +47,18 @@ struct alignas(2) Half { // HIP wants __host__ __device__ tag, CUDA does not #ifdef __HIP_PLATFORM_HCC__ - AT_HOSTDEVICE Half() = default; + AT_HOST_DEVICE Half() = default; #else Half() = default; #endif - constexpr AT_HOSTDEVICE Half(unsigned short bits, from_bits_t) : x(bits){}; - inline AT_HOSTDEVICE Half(float value); - inline AT_HOSTDEVICE operator float() const; + constexpr AT_HOST_DEVICE Half(unsigned short bits, from_bits_t) : x(bits){}; + inline AT_HOST_DEVICE Half(float value); + inline AT_HOST_DEVICE operator float() const; #ifdef __CUDACC__ - inline AT_HOSTDEVICE Half(const __half& value); - inline AT_HOSTDEVICE operator __half() const; + inline AT_HOST_DEVICE Half(const __half& value); + inline AT_HOST_DEVICE operator __half() const; #endif }; @@ -186,17 +178,8 @@ To checked_convert(From f, const char* name) { return convert(f); } -template -To HalfFix(From h) { - To ret; - ret.x = h.x; - return ret; -} - AT_CORE_API std::ostream& operator<<(std::ostream& out, const Half& value); } // namespace at #include "ATen/core/Half-inl.h" - -#undef AT_HOSTDEVICE diff --git a/aten/src/ATen/core/Macros.h b/aten/src/ATen/core/Macros.h index 67efa523ac2bba..244124475bc08f 100644 --- a/aten/src/ATen/core/Macros.h +++ b/aten/src/ATen/core/Macros.h @@ -39,6 +39,17 @@ #define AT_CORE_API AT_CORE_IMPORT #endif // defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) +#ifdef __CUDACC__ +// Designates functions callable from the host (CPU) and the device (GPU) +#define AT_HOST_DEVICE __host__ __device__ +#define AT_DEVICE __device__ +#define AT_HOST __host__ +#else +#define AT_HOST_DEVICE +#define AT_HOST +#define AT_DEVICE +#endif + // Disable the copy and assignment operator for a class. Note that this will // disable the usage of the class in std containers. 
#define AT_DISABLE_COPY_AND_ASSIGN(classname) \ diff --git a/aten/src/ATen/core/Scalar.h b/aten/src/ATen/core/Scalar.h index 35c4b538336aeb..de01a56ce33748 100644 --- a/aten/src/ATen/core/Scalar.h +++ b/aten/src/ATen/core/Scalar.h @@ -99,6 +99,6 @@ template<> \ inline T Scalar::to() { \ return to##name(); \ } -AT_FORALL_SCALAR_TYPES(DEFINE_TO) +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_TO) #undef DEFINE_TO } diff --git a/aten/src/ATen/core/ScalarType.h b/aten/src/ATen/core/ScalarType.h index b5e1a47646d7d6..6fe88bfadb05f5 100644 --- a/aten/src/ATen/core/ScalarType.h +++ b/aten/src/ATen/core/ScalarType.h @@ -80,6 +80,18 @@ static inline DataType scalarTypeToDataType(ScalarType scalar_type) { #undef DEFINE_CASE } +static inline caffe2::TypeMeta scalarTypeToTypeMeta(ScalarType scalar_type) { +#define DEFINE_CASE(ctype,name,_) \ + case ScalarType:: name : return caffe2::TypeMeta::Make(); + + switch(scalar_type) { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE) + case ScalarType::Undefined: return caffe2::TypeMeta(); + default: AT_ERROR("Unrecognized Scalartype ", scalar_type, " (please report this error)"); + } +#undef DEFINE_CASE +} + static inline ScalarType dataTypeToScalarType(DataType dtype) { #define DEFINE_IF(ctype,name,_) \ if (dtype == caffe2::TypeMeta::Id()) { \ diff --git a/aten/src/ATen/core/Storage.cpp b/aten/src/ATen/core/Storage.cpp index 21f3e35ada4fd2..aca4bb75d2c95b 100644 --- a/aten/src/ATen/core/Storage.cpp +++ b/aten/src/ATen/core/Storage.cpp @@ -2,28 +2,4 @@ namespace at { -Storage::Storage( - at::ScalarType scalar_type, - size_t size, - Allocator* allocator, - bool resizable) - : storage_impl_(c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), - size, - allocator, - resizable)) {} - -Storage::Storage( - at::ScalarType scalar_type, - at::DataPtr data_ptr, - size_t size, - const std::function& deleter, - bool resizable) - : storage_impl_(c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), - size, - std::move(data_ptr), - /* allocator */ nullptr, - resizable)) {} - } // namespace at diff --git a/aten/src/ATen/core/Storage.h b/aten/src/ATen/core/Storage.h index 6f2a8fd68ee716..ab201be88d630e 100644 --- a/aten/src/ATen/core/Storage.h +++ b/aten/src/ATen/core/Storage.h @@ -7,20 +7,58 @@ namespace at { struct AT_API Storage { public: Storage() {} - Storage(StorageImpl* storage_impl) : storage_impl_(c10::intrusive_ptr::reclaim(storage_impl)) {} - Storage(const c10::intrusive_ptr& ptr) : storage_impl_(ptr) {} - Storage(c10::intrusive_ptr&& ptr) : storage_impl_(std::move(ptr)) {} + Storage(c10::intrusive_ptr ptr) : storage_impl_(std::move(ptr)) {} Storage( - at::ScalarType, + caffe2::TypeMeta data_type, size_t size, Allocator* allocator, - bool resizable = false); + bool resizable = false) + : storage_impl_(c10::make_intrusive( + data_type, + size, + allocator, + resizable)) {} + Storage( - at::ScalarType, - at::DataPtr, + caffe2::TypeMeta data_type, + at::DataPtr data_ptr, size_t size, const std::function& deleter, - bool resizable = false); + bool resizable = false) + : storage_impl_(c10::make_intrusive( + data_type, + size, + std::move(data_ptr), + /* allocator */ nullptr, + resizable)) {} + + Storage(at::DeviceType device_type) + : storage_impl_(c10::make_intrusive(device_type)) {} + Storage(at::DeviceType device_type, caffe2::TypeMeta data_type) + : storage_impl_( + c10::make_intrusive(device_type, data_type)) {} + + Storage( + caffe2::TypeMeta data_type, + int64_t numel, + at::DataPtr data_ptr, + at::Allocator* allocator, + bool 
resizable) + : storage_impl_(c10::make_intrusive( + data_type, + numel, + std::move(data_ptr), + allocator, + resizable)) {} + + void reset() { + storage_impl_->reset(); + } + + template + inline bool IsType() const { + return storage_impl_->IsType(); + } template T* data() const { return storage_impl_->data(); } @@ -28,32 +66,122 @@ struct AT_API Storage { template T* unsafe_data() const { return storage_impl_->unsafe_data(); } - size_t elementSize() const { return storage_impl_->itemsize(); } - ptrdiff_t size() const { return storage_impl_->numel(); } - bool resizable() const { return storage_impl_->resizable(); } + size_t elementSize() const { + return storage_impl_->itemsize(); + } + + inline size_t itemsize() const { + return storage_impl_->itemsize(); + } + + ptrdiff_t size() const { + return storage_impl_->numel(); + } + + int64_t numel() const { + return storage_impl_->numel(); + } + + // TODO: remove later + void set_numel(int64_t numel) { + storage_impl_->set_numel(numel); + } + + bool resizable() const { + return storage_impl_->resizable(); + } + + size_t capacity() const { + return storage_impl_->capacity(); + } // get() use here is to get const-correctness - void* data() const { return storage_impl_.get()->data(); } - const at::DataType dtype() const { + + void* data() { + return storage_impl_->data(); + } + + void* data() const { + return storage_impl_.get()->data(); + } + + const caffe2::TypeMeta& dtype() const { return storage_impl_->dtype(); } - const at::DataPtr& data_ptr() const { return storage_impl_->data_ptr(); } - DeviceType device_type() const { return storage_impl_->device_type(); } - at::Allocator* allocator() const { return storage_impl_.get()->allocator(); } - at::Device device() const { return storage_impl_->device(); } + + at::DataPtr& data_ptr() { + return storage_impl_->data_ptr(); + } + + const at::DataPtr& data_ptr() const { + return storage_impl_->data_ptr(); + } + + // Returns the previous data_ptr + at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) { + return storage_impl_->set_data_ptr(std::move(data_ptr)); + }; + + void set_dtype(const caffe2::TypeMeta& data_type) { + storage_impl_->set_dtype(data_type); + } + + DeviceType device_type() const { + return storage_impl_->device_type(); + } + + at::Allocator* allocator() const { + return storage_impl_.get()->allocator(); + } + + at::Device device() const { + return storage_impl_->device(); + } StorageImpl* unsafeReleaseStorageImpl() { return storage_impl_.release(); } + StorageImpl* unsafeGetStorageImpl() const noexcept { return storage_impl_.get(); } + operator bool() const { return storage_impl_; } + size_t use_count() const { return storage_impl_.use_count(); } + inline bool unique() const { + return storage_impl_.unique(); + } + + void UniqueStorageShareExternalPointer( + void* src, + const caffe2::TypeMeta& data_type, + size_t capacity, + DeleterFnPtr d = nullptr) { + if (!storage_impl_.unique()) { + AT_ERROR( + "UniqueStorageShareExternalPointer can only be called when use_count == 1"); + } + storage_impl_->UniqueStorageShareExternalPointer( + src, data_type, capacity, d); + } + + void UniqueStorageShareExternalPointer( + at::DataPtr&& data_ptr, + const caffe2::TypeMeta& data_type, + size_t capacity) { + if (!storage_impl_.unique()) { + AT_ERROR( + "UniqueStorageShareExternalPointer can only be called when use_count == 1"); + } + storage_impl_->UniqueStorageShareExternalPointer( + std::move(data_ptr), data_type, capacity); + } + protected: c10::intrusive_ptr storage_impl_; }; diff --git 
a/aten/src/ATen/core/StorageImpl.cpp b/aten/src/ATen/core/StorageImpl.cpp index b0f82132e39cb8..5190a7766dcb49 100644 --- a/aten/src/ATen/core/StorageImpl.cpp +++ b/aten/src/ATen/core/StorageImpl.cpp @@ -1,30 +1 @@ #include - -namespace at { - -StorageImpl::StorageImpl( - at::DataType data_type, - int64_t numel, - at::DataPtr data_ptr, - at::Allocator* allocator, - bool resizable) - : data_type_(data_type), - data_ptr_(std::move(data_ptr)), - numel_(numel), - resizable_(resizable), - allocator_(allocator) {} - -StorageImpl::StorageImpl( - at::DataType data_type, - int64_t numel, - at::Allocator* allocator, - bool resizable) - : StorageImpl( - data_type, - numel, - allocator->allocate( - at::elementSize(dataTypeToScalarType(data_type)) * numel), - allocator, - resizable) {} - -} // namespace at diff --git a/aten/src/ATen/core/StorageImpl.h b/aten/src/ATen/core/StorageImpl.h index e80c11c6b0e211..cc63bd00906669 100644 --- a/aten/src/ATen/core/StorageImpl.h +++ b/aten/src/ATen/core/StorageImpl.h @@ -12,32 +12,73 @@ struct Type; struct AT_API StorageImpl : public c10::intrusive_ptr_target { public: - StorageImpl() = delete; - ~StorageImpl() {}; StorageImpl( - at::DataType data_type, + caffe2::TypeMeta data_type, int64_t numel, at::DataPtr data_ptr, at::Allocator* allocator, - bool resizable); + bool resizable) + : data_type_(data_type), + data_ptr_(std::move(data_ptr)), + numel_(numel), + resizable_(resizable), + allocator_(allocator) { + if (numel > 0) { + if (data_type_.id() == caffe2::TypeIdentifier::uninitialized()) { + AT_ERROR( + "Constructing a storage with meta of unknown type and non-zero numel"); + } + } + } + StorageImpl( - at::DataType data_type, + caffe2::TypeMeta data_type, int64_t numel, at::Allocator* allocator, - bool resizable); + bool resizable) + : StorageImpl( + data_type, + numel, + allocator->allocate(data_type.itemsize() * numel), + allocator, + resizable) {} + + explicit StorageImpl(at::DeviceType device_type) + : StorageImpl(device_type, caffe2::TypeMeta()) {} + + StorageImpl(at::DeviceType device_type, caffe2::TypeMeta data_type) + : StorageImpl( + data_type, + 0, + at::DataPtr(nullptr, at::Device(device_type)), + nullptr, + true) {} + + StorageImpl& operator=(StorageImpl&& other) = default; + StorageImpl& operator=(const StorageImpl&) = delete; + StorageImpl() = delete; + StorageImpl(StorageImpl&& other) = default; StorageImpl(StorageImpl&) = delete; StorageImpl(const StorageImpl&) = delete; - // NB: Don't move ref count! 
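// Illustrative sketch (not part of the diff above): constructing a Storage via
// the new caffe2::TypeMeta-based constructors. The helper name is hypothetical
// and the allocator is assumed to be supplied by the caller; TypeMeta carries
// the itemsize, so numel * itemsize bytes are allocated.
#include "ATen/core/Storage.h"

static at::Storage make_float_storage(at::Allocator* allocator, int64_t numel) {
  return at::Storage(caffe2::TypeMeta::Make<float>(), numel, allocator,
                     /*resizable=*/true);
}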
- StorageImpl(StorageImpl&& other) = default; - StorageImpl& operator=(StorageImpl&& other) = default; + ~StorageImpl() = default; + + void reset() { + data_ptr_.clear(); + numel_ = 0; + } + + template + inline bool IsType() const { + return data_type_.Match(); + } template inline T* data() const { auto data_type_T = at::scalarTypeToDataType(at::CTypeToScalarType::to()); - if (dtype() != data_type_T) { + if (dtype().id() != data_type_T) { AT_ERROR( "Attempt to access StorageImpl having data type ", - dtype(), + dtype().id(), " as data type ", data_type_T); } @@ -53,52 +94,77 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { data_ptr_.clear(); } - void operator=(const StorageImpl&) = delete; - size_t itemsize() const { - return at::elementSize(dataTypeToScalarType(data_type_)); + return data_type_.itemsize(); } Type& type(); + size_t capacity() const { + return numel_ * itemsize(); + } + int64_t numel() const { return numel_; }; + + // TODO: remove later void set_numel(int64_t numel) { numel_ = numel; }; + bool resizable() const { return resizable_; }; + at::DataPtr& data_ptr() { return data_ptr_; }; + const at::DataPtr& data_ptr() const { return data_ptr_; }; + // Returns the previous data_ptr at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) { std::swap(data_ptr_, data_ptr); return std::move(data_ptr); }; + + // XXX: TERRIBLE! DONT USE UNLESS YOU HAVE TO! AND EVEN THEN DONT, JUST DONT! + // Setting the data_type will require you to audit many other parts of the + // struct again to make sure it's still valid. + void set_dtype(const caffe2::TypeMeta& data_type) { + int64_t capacity = numel_ * data_type_.itemsize(); + data_type_ = data_type; + numel_ = capacity / data_type_.itemsize(); + } + + // TODO: Return const ptr eventually if possible void* data() { return data_ptr_.get(); - }; - const void* data() const { + } + + void* data() const { return data_ptr_.get(); - }; + } + at::DeviceType device_type() const { return data_ptr_.device().type(); } + at::Allocator* allocator() { return allocator_; - }; - const DataType dtype() const { + } + + const caffe2::TypeMeta& dtype() const { return data_type_; } + const at::Allocator* allocator() const { return allocator_; }; + // You generally shouldn't use this method, but it is occasionally // useful if you want to override how a tensor will be reallocated, // after it was already allocated (and its initial allocator was @@ -106,15 +172,53 @@ struct AT_API StorageImpl : public c10::intrusive_ptr_target { void set_allocator(at::Allocator* allocator) { allocator_ = allocator; } + Device device() const { return data_ptr_.device(); } + void set_resizable(bool resizable) { resizable_ = resizable; } + /** + * Can only be called when use_count is 1 + */ + void UniqueStorageShareExternalPointer( + void* src, + const caffe2::TypeMeta& data_type, + size_t capacity, + DeleterFnPtr d = nullptr) { + UniqueStorageShareExternalPointer( + at::DataPtr(src, src, d, data_ptr_.device()), data_type, capacity); + } + + /** + * Can only be called when use_count is 1 + */ + void UniqueStorageShareExternalPointer( + at::DataPtr&& data_ptr, + const caffe2::TypeMeta& data_type, + size_t capacity) { + data_type_ = data_type; + // TODO: Use CAFFE_ENFORCE_WITH_CALLER equivalent + // For now causes lots of redefine issues if caffe2/core/logging.h is used + if (data_type_.id() == caffe2::TypeIdentifier::uninitialized()) { + AT_ERROR( + "To share with a raw external pointer you need to have meta " + "already set."); + } + data_ptr_ = std::move(data_ptr); + // NOTE: 
data_type might change and so it's also possible that capacity + // might not be divisible by itemsize. There is no way for us to keep track + // of the exact capacity if we're not explicitly storing it. More concretely, + // capacity() might not return the value that was set here, if itemsize does + // not evenly divide it. + numel_ = capacity / data_type_.itemsize(); + } + private: - at::DataType data_type_; + caffe2::TypeMeta data_type_; at::DataPtr data_ptr_; int64_t numel_; bool resizable_; diff --git a/aten/src/ATen/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp similarity index 77% rename from aten/src/ATen/Tensor.cpp rename to aten/src/ATen/core/Tensor.cpp index 860a5d2ab0afe1..924688d40b9551 100644 --- a/aten/src/ATen/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -1,6 +1,6 @@ -#include -#include -#include +#include +#include +#include #include diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h new file mode 100644 index 00000000000000..1b2c0f0e288264 --- /dev/null +++ b/aten/src/ATen/core/Tensor.h @@ -0,0 +1,688 @@ +#pragma once + +#include "ATen/core/Device.h" +#include "ATen/core/Layout.h" +#include "ATen/core/Scalar.h" +#include "ATen/core/ScalarType.h" +#include "ATen/core/SparseTensorRef.h" +#include "ATen/core/Storage.h" +#include "ATen/core/TensorAccessor.h" +#include "ATen/core/TensorImpl.h" +#include "ATen/core/optional.h" +#include "ATen/core/UndefinedTensorImpl.h" +#include "ATen/core/Error.h" + +namespace at { +struct Generator; +struct Type; +struct Tensor; +struct TensorOptions; +} // namespace at + +namespace at { +// Tensor is a "generic" object holding a pointer to the underlying TensorImpl object, which +// has an embedded reference count. In this way, Tensor is similar to boost::intrusive_ptr. +// +// For example: +// +// void func(Tensor a) { +// Tensor b = a; +// ... +// } +// +// In this example, when we say Tensor b = a, we are creating a new object that points to the +// same underlying TensorImpl, and bumps its reference count. When b goes out of scope, the +// destructor decrements the reference count by calling release() on the TensorImpl it points to. +// The existing constructors, operator overloads, etc. take care to implement the correct semantics. +// +// Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and +// special care must be taken to handle this. +struct AT_API Tensor { + Tensor(){}; + Tensor(c10::intrusive_ptr tensor_impl) + : tensor_impl_(std::move(tensor_impl)) { + if (tensor_impl_.get() == nullptr) { + throw std::runtime_error("TensorBaseImpl with nullptr not supported"); + } + } + + Tensor(const Tensor&) = default; + Tensor(Tensor&&) = default; + + int64_t dim() const { + return tensor_impl_->dim(); + } + + TensorImpl * unsafeGetTensorImpl() const { + return tensor_impl_.get(); + } + TensorImpl * unsafeReleaseTensorImpl() { + return tensor_impl_.release(); + } + const c10::intrusive_ptr& getIntrusivePtr() const { + return tensor_impl_; + } + + bool defined() const { + return tensor_impl_; + } + + void reset() { + tensor_impl_.reset(); + } + + // The following overloads are very intriguing. Consider the following + // program: + // + // x[1] = 3; + // + // We would expect that the first entry of x is written to 3. But how can we + // actually achieve this? x[1] evaluates to a tensor... + // + // The answer is, using a ref-qualifier.
x[1] is an rvalue, which cannot be + // (profitably) assigned to in the traditional sense, so we overload + // assignment to mean, "Actually, copy 3 into the tensor data." This is done + // with an rvalue-reference ref-qualified overload (the methods with && at the + // end of their type.) + // + // There's one more fly in the ointment: We also want + // + // Tensor x = y; + // + // to work, and we want it NOT to copy. So we need a traditional operator= + // overload. But we MUST specify a mutable lvalue ref-qualifier, to + // disambiguate the traditional overload from the rvalue-reference + // ref-qualified overload. Otherwise, it will be ambiguous, because + // a non ref-qualified method is eligible for all situations. + + // Unfortunately, we have to write these constructors out manually + // to work around an MSVC bug: + // error C2580: 'at::Tensor &at::Tensor::operator =(const at::Tensor &) &': + // multiple versions of a defaulted special member functions are not allowed + // Tensor& operator=(const Tensor&) & = default; + // Tensor& operator=(Tensor&&) & = default; + Tensor& operator=(const Tensor& x) & { + tensor_impl_ = x.tensor_impl_; + return *this; + } + Tensor& operator=(Tensor&& x) & { + tensor_impl_ = std::move(x.tensor_impl_); + return *this; + } + + Tensor& operator=(Scalar v) &&; + Tensor& operator=(const Tensor&) &&; + Tensor& operator=(Tensor&&) &&; + + bool is_same(const Tensor& other) const noexcept { + return tensor_impl_ == other.tensor_impl_; + } + size_t use_count() const noexcept { + return tensor_impl_.use_count(); + } + size_t weak_use_count() const noexcept { + return tensor_impl_.weak_use_count(); + } + + const char * toString() const; + + IntList sizes() const { + return tensor_impl_->sizes(); + } + IntList strides() const { + return tensor_impl_->strides(); + } + int64_t ndimension() const { + return dim(); + } + Type & type() const { + return tensor_impl_->type(); + } + TensorTypeId type_id() const { + return tensor_impl_->type_id(); + } + ScalarType scalar_type() const { + return dataTypeToScalarType(tensor_impl_->dtype().id()); + } + const Storage& storage() const { + return tensor_impl_->storage(); + } + Tensor toType(const Type & t, bool non_blocking=false) const; + Tensor & copy_(const Tensor & src, bool non_blocking=false); + Tensor toType(ScalarType t) const; + Tensor toBackend(Backend b) const; + + /// Returns true if the `Tensor` is actually a `torch::autograd::Variable`. + /// Defined in Type.h because of include order issues. + bool is_variable() const noexcept; + + /// Returns a `Tensor`'s layout. Defined in Type.h + Layout layout() const noexcept; + + /// Returns a `Tensor`'s dtype (`ScalarType`). Defined in Type.h + ScalarType dtype() const noexcept; + + /// Returns a `Tensor`'s device. + Device device() const; + + /// Returns the `TensorOptions` corresponding to this `Tensor`. Defined in + /// TensorOptions.h. + TensorOptions options() const; + + template + T * data() const; + + // Purposely not defined here to avoid inlining + void print() const; + + //toLongData(), toFloatData() etc. + #define TO_TYPE_DATA(T,name,_) \ + T * to##name##Data() const; + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_TYPE_DATA) + #undef TO_TYPE_DATA + + #define TO_C_TYPE(T,name,_) \ + T toC##name () const; + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_C_TYPE) + #undef TO_C_TYPE + + // Return a `TensorAccessor` for CPU `Tensor`s. You have to specify scalar type and + // dimension. 
+ template + TensorAccessor accessor() const& { + static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); + AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + return TensorAccessor(data(),sizes().data(),strides().data()); + } + template + TensorAccessor accessor() && = delete; + + // Return a `PackedTensorAccessor` for CUDA `Tensor`s. You have to specify scalar type and + // dimension. You can optionally specify RestrictPtrTraits as a template parameter to + // cast the data pointer to a __restrict__ pointer. + // In order to use this, your CUDA kernel has to take a corresponding PackedTensorAccessor + // as an argument. + template class PtrTraits = DefaultPtrTraits> + PackedTensorAccessor packed_accessor() const& { + static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); + AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + return PackedTensorAccessor(static_cast::PtrType>(data()),sizes().data(),strides().data()); + } + template class PtrTraits = DefaultPtrTraits> + PackedTensorAccessor packed_accessor() && = delete; + + Tensor operator-() const; + Tensor& operator+=(const Tensor & other); + Tensor& operator+=(Scalar other); + Tensor& operator-=(const Tensor & other); + Tensor& operator-=(Scalar other); + Tensor& operator*=(const Tensor & other); + Tensor& operator*=(Scalar other); + Tensor& operator/=(const Tensor & other); + Tensor& operator/=(Scalar other); + Tensor operator[](Scalar index) const; + Tensor operator[](Tensor index) const; + Tensor operator[](int64_t index) const; + + Tensor cpu() const; + Tensor cuda() const; + + // ~~~~~ Autograd API ~~~~~ + + Tensor& set_requires_grad(bool requires_grad) { + tensor_impl_->set_requires_grad(requires_grad); + return *this; + } + bool requires_grad() const { + return tensor_impl_->requires_grad(); + } + + Tensor& grad() { + return tensor_impl_->grad(); + } + const Tensor& grad() const { + return tensor_impl_->grad(); + } + + void set_data(Tensor new_data); + + /// Computes the gradient of current tensor w.r.t. graph leaves. + void backward( + at::optional gradient = at::nullopt, + bool keep_graph = false, + bool create_graph = false); + + // STOP. Thinking of adding a method here, which only makes use + // of other ATen methods? Define it in native_functions.yaml. 
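// Illustrative sketch (not part of the diff above; assumes the usual ATen
// factory functions such as at::ones are available): the reference-count and
// rvalue-qualified assignment semantics described in the comments above.
#include "ATen/ATen.h"

static void refcount_demo() {
  at::Tensor a = at::ones({3});
  at::Tensor b = a;  // b shares a's TensorImpl; a.use_count() is now 2
  a[1] = 3;          // a[1] is an rvalue Tensor, so operator=(Scalar) &&
                     // writes 3 into the underlying data instead of rebinding
  b.reset();         // drops b's reference; a.use_count() is back to 1
}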
+ + //example + //Tensor * add(Tensor & b); + int64_t storage_offset() const; + Tensor & resize_(IntList size); + Tensor & set_(Storage source); + Tensor & set_(Storage source, int64_t storage_offset, IntList size, IntList stride={}); + Tensor & set_(const Tensor & source); + Tensor & set_(); + bool is_contiguous() const; + bool is_set_to(const Tensor & tensor) const; + Tensor & masked_fill_(const Tensor & mask, Scalar value); + Tensor & masked_fill_(const Tensor & mask, const Tensor & value); + Tensor & masked_scatter_(const Tensor & mask, const Tensor & source); + Tensor masked_select(const Tensor & mask) const; + Tensor nonzero() const; + Tensor contiguous() const; + Tensor view(IntList size) const; + Tensor index_select(int64_t dim, const Tensor & index) const; + Tensor take(const Tensor & index) const; + Tensor & put_(const Tensor & index, const Tensor & source, bool accumulate=false); + Tensor & index_add_(int64_t dim, const Tensor & index, const Tensor & source); + Tensor & index_fill_(int64_t dim, const Tensor & index, Scalar value); + Tensor & index_fill_(int64_t dim, const Tensor & index, const Tensor & value); + Tensor unfold(int64_t dimension, int64_t size, int64_t step) const; + Tensor & scatter_(int64_t dim, const Tensor & index, const Tensor & src); + Tensor & scatter_(int64_t dim, const Tensor & index, Scalar value); + Tensor & scatter_add_(int64_t dim, const Tensor & index, const Tensor & src); + Tensor gather(int64_t dim, const Tensor & index) const; + void* data_ptr() const; + bool equal(const Tensor & other) const; + Tensor __and__(Scalar other) const; + Tensor __and__(const Tensor & other) const; + Tensor & __iand__(Scalar other); + Tensor & __iand__(const Tensor & other); + Tensor __or__(Scalar other) const; + Tensor __or__(const Tensor & other) const; + Tensor & __ior__(Scalar other); + Tensor & __ior__(const Tensor & other); + Tensor __xor__(Scalar other) const; + Tensor __xor__(const Tensor & other) const; + Tensor & __ixor__(Scalar other); + Tensor & __ixor__(const Tensor & other); + Tensor __lshift__(Scalar other) const; + Tensor __lshift__(const Tensor & other) const; + Tensor & __ilshift__(Scalar other); + Tensor & __ilshift__(const Tensor & other); + Tensor __rshift__(Scalar other) const; + Tensor __rshift__(const Tensor & other) const; + Tensor & __irshift__(Scalar other); + Tensor & __irshift__(const Tensor & other); + Tensor lt(Scalar other) const; + Tensor lt(const Tensor & other) const; + Tensor & lt_(Scalar other); + Tensor & lt_(const Tensor & other); + Tensor gt(Scalar other) const; + Tensor gt(const Tensor & other) const; + Tensor & gt_(Scalar other); + Tensor & gt_(const Tensor & other); + Tensor le(Scalar other) const; + Tensor le(const Tensor & other) const; + Tensor & le_(Scalar other); + Tensor & le_(const Tensor & other); + Tensor ge(Scalar other) const; + Tensor ge(const Tensor & other) const; + Tensor & ge_(Scalar other); + Tensor & ge_(const Tensor & other); + Tensor eq(Scalar other) const; + Tensor eq(const Tensor & other) const; + Tensor & eq_(Scalar other); + Tensor & eq_(const Tensor & other); + Tensor ne(Scalar other) const; + Tensor ne(const Tensor & other) const; + Tensor & ne_(Scalar other); + Tensor & ne_(const Tensor & other); + Tensor min(const Tensor & other) const; + Tensor min() const; + Tensor max(const Tensor & other) const; + Tensor max() const; + Tensor median() const; + std::tuple sort(int64_t dim=-1, bool descending=false) const; + std::tuple topk(int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) const; + 
Tensor all() const; + Tensor any() const; + Tensor lgamma() const; + Tensor & lgamma_(); + Tensor digamma() const; + Tensor & digamma_(); + Tensor polygamma(int64_t n) const; + Tensor & polygamma_(int64_t n); + Tensor & erfinv_(); + Tensor erfinv() const; + Tensor & frac_(); + Tensor frac() const; + Tensor renorm(Scalar p, int64_t dim, Scalar maxnorm) const; + Tensor & renorm_(Scalar p, int64_t dim, Scalar maxnorm); + Tensor dist(const Tensor & other, Scalar p=2) const; + Tensor reciprocal() const; + Tensor & reciprocal_(); + Tensor neg() const; + Tensor & neg_(); + Tensor atan2(const Tensor & other) const; + Tensor & atan2_(const Tensor & other); + Tensor pow(const Tensor & exponent) const; + Tensor & pow_(Scalar exponent); + Tensor & pow_(const Tensor & exponent); + Tensor lerp(const Tensor & end, Scalar weight) const; + Tensor & lerp_(const Tensor & end, Scalar weight); + Tensor histc(int64_t bins=100, Scalar min=0, Scalar max=0) const; + Tensor sign() const; + Tensor & sign_(); + Tensor trace() const; + Tensor fmod(Scalar other) const; + Tensor fmod(const Tensor & other) const; + Tensor & fmod_(Scalar other); + Tensor & fmod_(const Tensor & other); + Tensor remainder(Scalar other) const; + Tensor remainder(const Tensor & other) const; + Tensor & remainder_(Scalar other); + Tensor & remainder_(const Tensor & other); + Tensor tril(int64_t diagonal=0) const; + Tensor & tril_(int64_t diagonal=0); + Tensor triu(int64_t diagonal=0) const; + Tensor & triu_(int64_t diagonal=0); + Tensor cross(const Tensor & other, int64_t dim=-1) const; + Tensor diag(int64_t diagonal=0) const; + Tensor addbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1); + Tensor addcmul(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor & addcmul_(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1); + Tensor addcdiv(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const; + Tensor & addcdiv_(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1); + std::tuple gels(const Tensor & A) const; + std::tuple trtrs(const Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false) const; + std::tuple symeig(bool eigenvectors=false, bool upper=true) const; + std::tuple eig(bool eigenvectors=false) const; + std::tuple svd(bool some=true) const; + Tensor potrf(bool upper=true) const; + Tensor potrs(const Tensor & input2, bool upper=true) const; + Tensor potri(bool upper=true) const; + std::tuple pstrf(bool upper=true, Scalar tol=-1) const; + std::tuple qr() const; + std::tuple geqrf() const; + Tensor orgqr(const Tensor & input2) const; + Tensor ormqr(const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false) const; + std::tuple btrifact(bool pivot=true) const; + std::tuple btrifact_with_info(bool pivot=true) const; + Tensor btrisolve(const Tensor & LU_data, const Tensor & LU_pivots) const; + Tensor & random_(int64_t from, int64_t to, Generator * generator=nullptr); + Tensor & random_(int64_t to, Generator * generator=nullptr); + Tensor & random_(Generator * generator=nullptr); + Tensor multinomial(int64_t num_samples, bool replacement=false, Generator * generator=nullptr) const; + Tensor & uniform_(double from=0, double to=1, Generator * generator=nullptr); + Tensor & normal_(double mean=0, double std=1, Generator * generator=nullptr); + Tensor & cauchy_(double median=0, double sigma=1, Generator * 
generator=nullptr); + Tensor & log_normal_(double mean=1, double std=2, Generator * generator=nullptr); + Tensor & exponential_(double lambd=1, Generator * generator=nullptr); + Tensor & geometric_(double p, Generator * generator=nullptr); + Tensor abs() const; + Tensor & abs_(); + Tensor acos() const; + Tensor & acos_(); + Tensor add(const Tensor & other, Scalar alpha=1) const; + Tensor & add_(const Tensor & other, Scalar alpha=1); + Tensor add(Scalar other, Scalar alpha=1) const; + Tensor & add_(Scalar other, Scalar alpha=1); + Tensor addmv(const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const; + Tensor & addmv_(const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1); + Tensor addr(const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addr_(const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1); + Tensor all(int64_t dim, bool keepdim=false) const; + bool allclose(const Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const; + Tensor any(int64_t dim, bool keepdim=false) const; + Tensor argmax(int64_t dim, bool keepdim=false) const; + Tensor argmax() const; + Tensor argmin(int64_t dim, bool keepdim=false) const; + Tensor argmin() const; + Tensor as_strided(IntList size, IntList stride) const; + Tensor & as_strided_(IntList size, IntList stride); + Tensor as_strided(IntList size, IntList stride, int64_t storage_offset) const; + Tensor & as_strided_(IntList size, IntList stride, int64_t storage_offset); + Tensor asin() const; + Tensor & asin_(); + Tensor atan() const; + Tensor & atan_(); + Tensor baddbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const; + Tensor & baddbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1); + Tensor bernoulli(const Tensor & p, Generator * generator=nullptr) const; + Tensor bernoulli(double p, Generator * generator=nullptr) const; + Tensor bernoulli() const; + Tensor & bernoulli_(const Tensor & p, Generator * generator=nullptr); + Tensor & bernoulli_(double p, Generator * generator=nullptr); + Tensor & bernoulli_(); + Tensor bincount(const Tensor & weights={}, int64_t minlength=0) const; + Tensor bmm(const Tensor & mat2) const; + Tensor ceil() const; + Tensor & ceil_(); + std::vector chunk(int64_t chunks, int64_t dim=0) const; + Tensor clamp(Scalar min, Scalar max) const; + Tensor & clamp_(Scalar min, Scalar max); + Tensor clamp_max(Scalar max) const; + Tensor & clamp_max_(Scalar max); + Tensor clamp_min(Scalar min) const; + Tensor & clamp_min_(Scalar min); + Tensor cos() const; + Tensor & cos_(); + Tensor cosh() const; + Tensor & cosh_(); + Tensor cumsum(int64_t dim, ScalarType dtype) const; + Tensor cumsum(int64_t dim) const; + Tensor cumprod(int64_t dim, ScalarType dtype) const; + Tensor cumprod(int64_t dim) const; + Tensor det() const; + Tensor diagflat(int64_t offset=0) const; + Tensor diagonal(int64_t offset=0, int64_t dim1=0, int64_t dim2=1) const; + Tensor div(const Tensor & other) const; + Tensor & div_(const Tensor & other); + Tensor div(Scalar other) const; + Tensor & div_(Scalar other); + Tensor dot(const Tensor & tensor) const; + Tensor erf() const; + Tensor & erf_(); + Tensor erfc() const; + Tensor & erfc_(); + Tensor exp() const; + Tensor & exp_(); + Tensor expm1() const; + Tensor & expm1_(); + Tensor expand(IntList size, bool implicit=false) const; + Tensor expand_as(const Tensor & other) const; + Tensor flatten(int64_t start_dim=0, int64_t end_dim=-1) const; + 
Tensor & fill_(Scalar value); + Tensor & fill_(const Tensor & value); + Tensor floor() const; + Tensor & floor_(); + Tensor ger(const Tensor & vec2) const; + std::tuple gesv(const Tensor & A) const; + Tensor fft(int64_t signal_ndim, bool normalized=false) const; + Tensor ifft(int64_t signal_ndim, bool normalized=false) const; + Tensor rfft(int64_t signal_ndim, bool normalized=false, bool onesided=true) const; + Tensor irfft(int64_t signal_ndim, bool normalized=false, bool onesided=true, IntList signal_sizes={}) const; + Tensor index(TensorList indices) const; + Tensor & index_copy_(int64_t dim, const Tensor & index, const Tensor & source); + Tensor index_put(TensorList indices, const Tensor & values) const; + Tensor & index_put_(TensorList indices, const Tensor & values); + Tensor inverse() const; + Tensor isclose(const Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const; + bool is_cuda() const; + bool is_distributed() const; + bool is_floating_point() const; + bool is_complex() const; + bool is_nonzero() const; + bool is_same_size(const Tensor & other) const; + bool is_signed() const; + bool is_sparse() const; + std::tuple kthvalue(int64_t k, int64_t dim=-1, bool keepdim=false) const; + Tensor log() const; + Tensor & log_(); + Tensor log10() const; + Tensor & log10_(); + Tensor log1p() const; + Tensor & log1p_(); + Tensor log2() const; + Tensor & log2_(); + Tensor logdet() const; + Tensor log_softmax(int64_t dim) const; + Tensor logsumexp(int64_t dim, bool keepdim=false) const; + Tensor matmul(const Tensor & other) const; + Tensor matrix_power(int64_t n) const; + std::tuple max(int64_t dim, bool keepdim=false) const; + Tensor max_values(int64_t dim, bool keepdim=false) const; + Tensor mean(ScalarType dtype) const; + Tensor mean() const; + Tensor mean(int64_t dim, bool keepdim, ScalarType dtype) const; + Tensor mean(int64_t dim, bool keepdim=false) const; + Tensor mean(int64_t dim, ScalarType dtype) const; + std::tuple median(int64_t dim, bool keepdim=false) const; + std::tuple min(int64_t dim, bool keepdim=false) const; + Tensor min_values(int64_t dim, bool keepdim=false) const; + Tensor mm(const Tensor & mat2) const; + std::tuple mode(int64_t dim=-1, bool keepdim=false) const; + Tensor mul(const Tensor & other) const; + Tensor & mul_(const Tensor & other); + Tensor mul(Scalar other) const; + Tensor & mul_(Scalar other); + Tensor mv(const Tensor & vec) const; + Tensor mvlgamma(int64_t p) const; + Tensor & mvlgamma_(int64_t p); + Tensor narrow(int64_t dim, int64_t start, int64_t length) const; + Tensor permute(IntList dims) const; + Tensor pin_memory() const; + Tensor pinverse(double rcond=1e-15) const; + Tensor repeat(IntList repeats) const; + Tensor reshape(IntList shape) const; + Tensor reshape_as(const Tensor & other) const; + Tensor round() const; + Tensor & round_(); + Tensor relu() const; + Tensor & relu_(); + Tensor hardshrink(Scalar lambd=0.5) const; + Tensor hardshrink_backward(const Tensor & grad_out, Scalar lambd) const; + Tensor rsqrt() const; + Tensor & rsqrt_(); + Tensor select(int64_t dim, int64_t index) const; + Tensor sigmoid() const; + Tensor & sigmoid_(); + Tensor sin() const; + Tensor & sin_(); + Tensor sinh() const; + Tensor & sinh_(); + Tensor detach() const; + Tensor & detach_(); + int64_t size(int64_t dim) const; + Tensor slice(int64_t dim=0, int64_t start=0, int64_t end=9223372036854775807, int64_t step=1) const; + std::tuple slogdet() const; + Tensor smm(const Tensor & mat2) const; + Tensor softmax(int64_t dim) const; + std::vector 
split(int64_t split_size, int64_t dim=0) const; + std::vector split_with_sizes(IntList split_sizes, int64_t dim=0) const; + Tensor squeeze() const; + Tensor squeeze(int64_t dim) const; + Tensor & squeeze_(); + Tensor & squeeze_(int64_t dim); + Tensor sspaddmm(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + Tensor stft(int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor & window={}, bool normalized=false, bool onesided=true) const; + int64_t stride(int64_t dim) const; + Tensor sum(ScalarType dtype) const; + Tensor sum() const; + Tensor sum(IntList dim, bool keepdim, ScalarType dtype) const; + Tensor sum(IntList dim, bool keepdim=false) const; + Tensor sum(IntList dim, ScalarType dtype) const; + Tensor sqrt() const; + Tensor & sqrt_(); + Tensor std(bool unbiased=true) const; + Tensor std(int64_t dim, bool unbiased=true, bool keepdim=false) const; + Tensor prod(ScalarType dtype) const; + Tensor prod() const; + Tensor prod(int64_t dim, bool keepdim, ScalarType dtype) const; + Tensor prod(int64_t dim, bool keepdim=false) const; + Tensor prod(int64_t dim, ScalarType dtype) const; + Tensor t() const; + Tensor & t_(); + Tensor tan() const; + Tensor & tan_(); + Tensor tanh() const; + Tensor & tanh_(); + Tensor transpose(int64_t dim0, int64_t dim1) const; + Tensor & transpose_(int64_t dim0, int64_t dim1); + Tensor flip(IntList dims) const; + Tensor rot90(int64_t k=1, IntList dims={0,1}) const; + Tensor trunc() const; + Tensor & trunc_(); + Tensor type_as(const Tensor & other) const; + Tensor unsqueeze(int64_t dim) const; + Tensor & unsqueeze_(int64_t dim); + Tensor var(bool unbiased=true) const; + Tensor var(int64_t dim, bool unbiased=true, bool keepdim=false) const; + Tensor view_as(const Tensor & other) const; + Tensor where(const Tensor & condition, const Tensor & other) const; + Tensor norm(Scalar p=2) const; + Tensor norm(Scalar p, int64_t dim, bool keepdim=false) const; + Tensor clone() const; + Tensor & resize_as_(const Tensor & the_template); + Tensor pow(Scalar exponent) const; + Tensor & zero_(); + Tensor sub(const Tensor & other, Scalar alpha=1) const; + Tensor & sub_(const Tensor & other, Scalar alpha=1); + Tensor sub(Scalar other, Scalar alpha=1) const; + Tensor & sub_(Scalar other, Scalar alpha=1); + Tensor addmm(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const; + Tensor & addmm_(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1); + Tensor & sparse_resize_(IntList size, int64_t sparseDims, int64_t denseDims); + Tensor & sparse_resize_and_clear_(IntList size, int64_t sparseDims, int64_t denseDims); + Tensor sparse_mask(SparseTensorRef mask) const; + Tensor to_dense() const; + int64_t _sparseDims() const; + int64_t _denseDims() const; + int64_t _nnz() const; + Tensor coalesce() const; + bool is_coalesced() const; + Tensor _indices() const; + Tensor _values() const; + int64_t numel() const; + std::vector unbind(int64_t dim=0) const; + int64_t get_device() const; + Tensor to(Device device, ScalarType dtype, bool non_blocking=false) const; + Tensor to(ScalarType dtype, bool non_blocking=false) const; + Tensor to(Device device, bool non_blocking=false) const; + Tensor to(const Tensor & other, bool non_blocking=false) const; + Scalar _local_scalar() const; + + template + auto m(F func, Args&&... 
params) const -> decltype(func(*this, std::forward(params)...)) { + return func(*this, std::forward(params)...); + } + + friend struct WeakTensor; + +protected: + c10::intrusive_ptr tensor_impl_; +}; + +struct AT_API WeakTensor { + WeakTensor(const Tensor& t) : weak_tensor_impl_(t.tensor_impl_) {} + + // XXX: this can return undefined tensors + // Ideally it would be at::optional, but MSVC is too cool for that + Tensor lock() const { + return Tensor(weak_tensor_impl_.lock()); + } + + bool is_same(const WeakTensor& other) const noexcept { + return weak_tensor_impl_ == other.weak_tensor_impl_; + } + + size_t use_count() const noexcept { + return weak_tensor_impl_.use_count(); + } + size_t weak_use_count() const noexcept { + return weak_tensor_impl_.weak_use_count(); + } + + TensorImpl* unsafeGetTensorImpl() const { + return weak_tensor_impl_._unsafe_get_target(); + } + +private: + c10::weak_intrusive_ptr weak_tensor_impl_; +}; +} // namespace at + +#include "ATen/core/TensorMethods.h" diff --git a/aten/src/ATen/core/TensorAccessor.h b/aten/src/ATen/core/TensorAccessor.h index e3a73a3cbea2e5..d8a851d998332a 100644 --- a/aten/src/ATen/core/TensorAccessor.h +++ b/aten/src/ATen/core/TensorAccessor.h @@ -2,46 +2,145 @@ #include #include +#include namespace at { +// The PtrTraits argument to the TensorAccessor/PackedTensorAccessor +// is used to enable the __restrict__ keyword/modifier for the data +// passed to cuda. +template +struct DefaultPtrTraits { + typedef T* PtrType; +}; + +#ifdef __CUDACC__ +template +struct RestrictPtrTraits { + typedef T* __restrict__ PtrType; +}; +#endif + +// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors. +// For CUDA tensors it is used in device code (only). This means that we restrict ourselves +// to functions and types available there (e.g. IntList isn't). -template +// The PtrTraits argument is only relevant to cuda to support `__restrict__` pointers. +template class PtrTraits = DefaultPtrTraits> class TensorAccessorBase { public: - TensorAccessorBase(T * data_, const int64_t * sizes_, const int64_t * strides_) + typedef typename PtrTraits::PtrType PtrType; + + AT_HOST_DEVICE TensorAccessorBase(PtrType data_, const int64_t * sizes_, const int64_t * strides_) : data_(data_), sizes_(sizes_), strides_(strides_) {} - IntList sizes() { + AT_HOST IntList sizes() const { return IntList(sizes_,N); } - IntList strides() { + AT_HOST IntList strides() const { return IntList(strides_,N); } - int64_t stride(int64_t i) { return strides()[i]; } - int64_t size(int64_t i) { return sizes()[i]; } + AT_HOST_DEVICE int64_t stride(int64_t i) const { return strides_[i]; } + AT_HOST_DEVICE int64_t size(int64_t i) const { return sizes_[i]; } + AT_HOST_DEVICE T *data() { return data_; } + AT_HOST_DEVICE const T *data() const { return data_; } protected: - T * data_; + PtrType data_; const int64_t* sizes_; const int64_t* strides_; }; -template -class TensorAccessor : public TensorAccessorBase { +// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using +// `Tensor.accessor()`. +// For CUDA `Tensor`s, `PackedTensorAccessor` is used on the host and only +// indexing on the device uses `TensorAccessor`s. 
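A minimal host-side usage sketch (not part of this patch; the helper name `add_one` is illustrative and assumes `t` is a 2-D float CPU tensor): `Tensor::accessor<T, N>()` builds a `TensorAccessor`, after which indexing is plain pointer arithmetic over the stored sizes and strides.

#include <ATen/ATen.h>

// Adds 1 to every element of a 2-D float CPU tensor through a TensorAccessor.
// The accessor validates the scalar type and dimensionality once, up front.
void add_one(at::Tensor t) {
  auto a = t.accessor<float, 2>();          // TensorAccessor<float, 2>
  for (int64_t i = 0; i < a.size(0); ++i) {
    for (int64_t j = 0; j < a.size(1); ++j) {
      a[i][j] += 1.0f;                      // a[i] is a TensorAccessor<float, 1>
    }
  }
}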
+template class PtrTraits = DefaultPtrTraits> +class TensorAccessor : public TensorAccessorBase { public: - TensorAccessor(T * data_, const int64_t * sizes_, const int64_t * strides_) + typedef typename PtrTraits::PtrType PtrType; + + AT_HOST_DEVICE TensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) : TensorAccessorBase(data_,sizes_,strides_) {} - TensorAccessor operator[](int64_t i) { + AT_HOST_DEVICE TensorAccessor operator[](int64_t i) { + return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); + } + + AT_HOST_DEVICE const TensorAccessor operator[](int64_t i) const { return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1); } }; -template -class TensorAccessor : public TensorAccessorBase { +template class PtrTraits> +class TensorAccessor : public TensorAccessorBase { +public: + typedef typename PtrTraits::PtrType PtrType; + + AT_HOST_DEVICE TensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + : TensorAccessorBase(data_,sizes_,strides_) {} + AT_HOST_DEVICE T & operator[](int64_t i) { + return this->data_[this->strides_[0]*i]; + } +}; + + +// PackedTensorAccessorBase and PackedTensorAccessor are used for CUDA `Tensor`s on the host. +// In contrast to `TensorAccessor`s, they copy the strides and sizes on instantiation (on the host) +// in order to transfer them to the device when calling kernels. +// On the device, indexing of multidimensional tensors yields `TensorAccessor`s. +// Use RestrictPtrTraits as PtrTraits if you want the tensor's data pointer to be marked as __restrict__. +// Instantiation from data, sizes, strides is only needed on the host and std::copy isn't available +// on the device, so those functions are host only. 
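A hedged CUDA sketch of the pattern described above (the names `scale` and `scale_on_gpu` are illustrative, not from this patch; it assumes compilation with nvcc so that `RestrictPtrTraits` is available, and a 2-D float CUDA tensor small enough for a one-block-per-row launch): the accessor is built on the host, carries its sizes and strides by value, and is indexed on the device.

#include <ATen/ATen.h>

// Kernel receives the accessor by value; sizes and strides were copied into it
// on the host, so no extra device-side metadata allocation is required.
__global__ void scale(at::PackedTensorAccessor<float, 2, at::RestrictPtrTraits> a,
                      float factor) {
  const int64_t row = blockIdx.x;
  const int64_t col = threadIdx.x;
  if (row < a.size(0) && col < a.size(1)) {
    a[row][col] *= factor;   // operator[] yields a 1-D TensorAccessor, then a float&
  }
}

void scale_on_gpu(at::Tensor t, float factor) {
  // Host-only construction straight from the raw pointer, sizes, and strides.
  at::PackedTensorAccessor<float, 2, at::RestrictPtrTraits>
      acc(t.data<float>(), t.sizes().data(), t.strides().data());
  scale<<<t.size(0), t.size(1)>>>(acc, factor);
}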
+template class PtrTraits = DefaultPtrTraits> +class PackedTensorAccessorBase { +public: + typedef typename PtrTraits::PtrType PtrType; + AT_HOST PackedTensorAccessorBase(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + : data_(data_) + { + std::copy(sizes_, sizes_ + N, std::begin(this->sizes_)); + std::copy(strides_, strides_ + N, std::begin(this->strides_)); + } + AT_HOST_DEVICE int64_t stride(int64_t i) const { return strides_[i]; } + AT_HOST_DEVICE int64_t size(int64_t i) const { return sizes_[i]; } +protected: + PtrType data_; + int64_t sizes_[N]; + int64_t strides_[N]; +}; + +template class PtrTraits = DefaultPtrTraits> +class PackedTensorAccessor : public PackedTensorAccessorBase { +public: + typedef typename PtrTraits::PtrType PtrType; + + AT_HOST PackedTensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + : PackedTensorAccessorBase(data_, sizes_, strides_) {}; + + AT_DEVICE TensorAccessor operator[](int64_t i) { + int64_t* new_sizes = this->sizes_+1; + int64_t* new_strides = this->strides_+1; + return TensorAccessor(this->data_ + this->strides_[0]*i, new_sizes, new_strides); + } + + AT_DEVICE const TensorAccessor operator[](int64_t i) const { + int64_t* new_sizes = this->sizes_+1; + int64_t* new_strides = this->strides_+1; + return TensorAccessor(this->data_ + this->strides_[0]*i, new_sizes, new_strides); + } +}; + +template class PtrTraits> +class PackedTensorAccessor : public PackedTensorAccessorBase { public: - TensorAccessor(T * data_, const int64_t * sizes_, const int64_t * strides_) - : TensorAccessorBase(data_,sizes_,strides_) {} - T & operator[](int64_t i) { + typedef typename PtrTraits::PtrType PtrType; + AT_HOST PackedTensorAccessor(PtrType data_, const int64_t * sizes_, const int64_t * strides_) + : PackedTensorAccessorBase(data_, sizes_, strides_) {}; + + AT_DEVICE T & operator[](int64_t i) { + return this->data_[this->strides_[0]*i]; + } + AT_DEVICE const T& operator[](int64_t i) const { return this->data_[this->strides_[0]*i]; } }; diff --git a/aten/src/ATen/TensorImpl.cpp b/aten/src/ATen/core/TensorImpl.cpp similarity index 79% rename from aten/src/ATen/TensorImpl.cpp rename to aten/src/ATen/core/TensorImpl.cpp index f4ecaf0b6253fa..5b568482d8dfe2 100644 --- a/aten/src/ATen/TensorImpl.cpp +++ b/aten/src/ATen/core/TensorImpl.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -17,19 +17,19 @@ const Tensor& TensorImpl::grad() const { AT_ERROR("grad is not implemented for Tensor"); } -TensorImpl::TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable) - : TensorImpl({}, type_id, scalar_type, is_variable) { +TensorImpl::TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, Allocator *allocator, bool is_variable) + : TensorImpl({}, type_id, data_type, is_variable) { // UndefinedTensors and SparseTensors don't have storages. 
- if (type_id != UndefinedTensorId() && scalar_type != ScalarType::Undefined + if (type_id != UndefinedTensorId() && data_type.id() != caffe2::TypeIdentifier::uninitialized() && type_id != SparseCPUTensorId() && type_id != SparseCUDATensorId()) { - storage_ = Storage(scalar_type, 0, allocator, true); + storage_ = Storage(data_type, 0, allocator, true); } } TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable) - : TensorImpl(std::move(storage), type_id, dataTypeToScalarType(storage.dtype()), is_variable) {} + : TensorImpl(std::move(storage), type_id, storage.dtype(), is_variable) {} -TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scalar_type, bool is_variable) +TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::TypeMeta& data_type, bool is_variable) : storage_(std::move(storage)), storage_offset_(0), sizes_{0}, @@ -37,7 +37,7 @@ TensorImpl::TensorImpl(Storage&& storage, TensorTypeId type_id, ScalarType scala is_contiguous_(true), numel_(0), type_id_(type_id), - scalar_type_(scalar_type), + data_type_(data_type), is_variable_(is_variable) {} IntList TensorImpl::sizes() const { diff --git a/aten/src/ATen/core/TensorImpl.h b/aten/src/ATen/core/TensorImpl.h new file mode 100644 index 00000000000000..d2f98ff52780f8 --- /dev/null +++ b/aten/src/ATen/core/TensorImpl.h @@ -0,0 +1,210 @@ +#pragma once + +#include +#include + +#include "ATen/core/Storage.h" +#include "ATen/core/optional.h" +#include "ATen/core/TensorTypeId.h" +#include "ATen/core/TensorTypeIdRegistration.h" +#include "ATen/core/LegacyTypeDispatch.h" +#include "ATen/core/Backend.h" + +struct THTensor; + +namespace at { +class Scalar; +struct Type; +struct Storage; +struct Tensor; +} // namespace at + +namespace at { +struct AT_API TensorImpl : public c10::intrusive_ptr_target { + TensorImpl() = delete; + TensorImpl(TensorTypeId type_id, const caffe2::TypeMeta& data_type, Allocator *allocator, bool is_variable); + TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable); + + virtual void release_resources() override; + + Type & type() const { + // NB: It's valid to use getTypeRaw here, because the TensorImpl + // could not have been created without initializing the Type first. + // TODO: This is not actually true via the Caffe2 codepath! Make + // it so. + return *globalLegacyTypeDispatch().getTypeRaw(tensorTypeIdToBackend(type_id()), dataTypeToScalarType(dtype().id()), is_variable()); + } + + TensorTypeId type_id() const { return type_id_; } + virtual IntList sizes() const; + virtual IntList strides() const; + virtual int64_t dim() const; + virtual const Storage& storage() const; + friend struct Type; + + virtual int64_t numel() const { +#ifdef DEBUG + AT_ASSERT(compute_numel() == numel_); +#endif + return numel_; + } + + virtual bool is_contiguous() const { +#ifdef DEBUG + AT_ASSERT(compute_contiguous() == is_contiguous_); +#endif + return is_contiguous_; + } + + // this is called by the generated wrapper code when there are conditions + // when this output tensor should be zero dimensional. e.g. when all inputs + // to a function 'add' were zero dimensional, then condition_when_zero_dim == true. + // we also prevent this from getting marked as a zero dim tensor if it is not + // the right shape afterall. + virtual TensorImpl* maybe_zero_dim(bool condition_when_zero_dim); + + // True if a tensor was auto-wrapped from a C++ or Python number. 
+ // Wrapped numbers do not participate in the result type computation for + // mixed-type operations if there are any Tensors that are not wrapped + // numbers. Otherwise, they behave like their non-wrapped equivalents. + // See [Result type computation] in TensorIterator.h. + bool is_wrapped_number() const { + return is_wrapped_number_; + } + void set_wrapped_number(bool value) { + AT_ASSERT(dim() == 0); + is_wrapped_number_ = value; + } + + // ~~~~~ Autograd API ~~~~~ + // Some methods below are defined in TensorImpl.cpp because Tensor is an + // incomplete type. + + virtual void set_requires_grad(bool requires_grad) { + AT_ERROR("set_requires_grad is not implemented for Tensor"); + } + virtual bool requires_grad() const { + AT_ERROR("requires_grad is not implemented for Tensor"); + } + + virtual Tensor& grad(); + virtual const Tensor& grad() const; + + // TODO: make these protected + // Note: storage->size() may be greater than the recorded size + // of a tensor + at::Storage storage_; + + template + inline T * data() const { + return storage_.data() + storage_offset_; + } + + inline void* data() const { + return static_cast( + static_cast(storage_.data()) + + data_type_.itemsize() * storage_offset_); + } + + template + inline T * unsafe_data() const { + return storage_.unsafe_data() + storage_offset_; + } + + const caffe2::TypeMeta& dtype() const { + return data_type_; + } + + virtual int64_t storage_offset() const { + return storage_offset_; + } + + // represents that numel() == 0. + inline bool is_empty() const { + return numel() == 0; + } + + virtual void resize_dim(int64_t ndim) { + // NB: This is *truly* a resize; calling code (e.g., squeeze) + // assumes that old values are preserved + sizes_.resize(ndim); + strides_.resize(ndim); + refresh_numel(); + refresh_contiguous(); + } + + virtual void set_size(int64_t dim, int64_t new_size) { + sizes_[dim] = new_size; + refresh_numel(); + refresh_contiguous(); + } + + virtual void set_stride(int64_t dim, int64_t new_stride) { + strides_[dim] = new_stride; + refresh_numel(); + refresh_contiguous(); + } + + virtual void set_storage_offset(int64_t storage_offset) { + storage_offset_ = storage_offset; + refresh_numel(); + refresh_contiguous(); + } + + // WARNING: This function does not check if the requested + // sizes/strides are in bounds for the storage that is allocated; + // this is the responsibility of the caller + void set_sizes_and_strides(at::IntList new_size, at::IntList new_stride) { + AT_CHECK( + new_size.size() == new_stride.size(), + "dimensionality of sizes (", + new_size.size(), + ") must match dimensionality of strides (", + new_stride.size(), + ")"); + sizes_ = new_size.vec(); + strides_ = new_stride.vec(); + refresh_numel(); + refresh_contiguous(); + } + + virtual int64_t size(int64_t d) const; + virtual int64_t stride(int64_t d) const; + + bool is_variable() const { return is_variable_; }; + + private: + int64_t storage_offset_; + std::vector sizes_; + std::vector strides_; + + bool is_contiguous_; + int64_t numel_; + + int64_t compute_numel() const { + int64_t n = 1; + for (auto s : sizes()) { + n *= s; + } + return n; + } + bool compute_contiguous() const; + + protected: + void refresh_numel() { + numel_ = compute_numel(); + } + void refresh_contiguous() { + is_contiguous_ = compute_contiguous(); + } + TensorTypeId type_id_; + // INVARIANT: When storage is non-null, this type meta must + // agree with the type meta in storage + caffe2::TypeMeta data_type_; + bool is_variable_ = false; + bool is_wrapped_number_ = 
false; + + private: + TensorImpl(Storage&& storage, TensorTypeId type_id, const caffe2::TypeMeta& data_type, bool is_variable); +}; +} // namespace at diff --git a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h new file mode 100644 index 00000000000000..ff85267e78fb81 --- /dev/null +++ b/aten/src/ATen/core/TensorMethods.h @@ -0,0 +1,1258 @@ +#pragma once + +#include "ATen/core/Tensor.h" +#include "ATen/core/Scalar.h" +#include "ATen/core/SparseTensorRef.h" +#include "ATen/core/Type.h" +#include "ATen/core/TensorOptions.h" + +namespace at { + +inline Tensor Tensor::toType(const Type & t, bool non_blocking) const { + if(type() == t) + return *this; + return t.copy(*this, non_blocking); +} + +inline Tensor Tensor::cpu() const { + return toType(type().cpu()); +} + +inline Tensor Tensor::cuda() const { + return toType(type().cuda()); +} + +inline Tensor & Tensor::copy_(const Tensor & src, bool non_blocking) { + return type().copy_(*this, src, non_blocking); +} + +inline Tensor Tensor::toType(ScalarType t) const { + return toType(type().toScalarType(t)); +} + +inline Tensor Tensor::toBackend(Backend b) const { + return toType(type().toBackend(b)); +} + +inline TensorOptions Tensor::options() const { + return TensorOptions().dtype(dtype()) + .device(device()) + .layout(layout()) + .is_variable(is_variable()); +} + +inline void Tensor::backward( + at::optional gradient, + bool keep_graph, + bool create_graph) { + type().backward(*this, std::move(gradient), keep_graph, create_graph); +} + +inline void Tensor::set_data(Tensor new_data) { + type().set_data(*this, new_data); +} + +// all static inline to allow for inlining of the non-dynamic part of dispatch +inline int64_t Tensor::storage_offset() const { + return type().storage_offset(*this); +} +inline Tensor & Tensor::resize_(IntList size) { + return type().resize_(*this, size); +} +inline Tensor & Tensor::set_(Storage source) { + return type().set_(*this, source); +} +inline Tensor & Tensor::set_(Storage source, int64_t storage_offset, IntList size, IntList stride) { + return type().set_(*this, source, storage_offset, size, stride); +} +inline Tensor & Tensor::set_(const Tensor & source) { + return type().set_(*this, source); +} +inline Tensor & Tensor::set_() { + return type().set_(*this); +} +inline bool Tensor::is_contiguous() const { + return type().is_contiguous(*this); +} +inline bool Tensor::is_set_to(const Tensor & tensor) const { + return type().is_set_to(*this, tensor); +} +inline Tensor & Tensor::masked_fill_(const Tensor & mask, Scalar value) { + return type().masked_fill_(*this, mask, value); +} +inline Tensor & Tensor::masked_fill_(const Tensor & mask, const Tensor & value) { + return type().masked_fill_(*this, mask, value); +} +inline Tensor & Tensor::masked_scatter_(const Tensor & mask, const Tensor & source) { + return type().masked_scatter_(*this, mask, source); +} +inline Tensor Tensor::masked_select(const Tensor & mask) const { + return type().masked_select(*this, mask); +} +inline Tensor Tensor::nonzero() const { + return type().nonzero(*this); +} +inline Tensor Tensor::contiguous() const { + return type().contiguous(*this); +} +inline Tensor Tensor::view(IntList size) const { + return type().view(*this, size); +} +inline Tensor Tensor::index_select(int64_t dim, const Tensor & index) const { + return type().index_select(*this, dim, index); +} +inline Tensor Tensor::take(const Tensor & index) const { + return type().take(*this, index); +} +inline Tensor & Tensor::put_(const Tensor & index, const Tensor 
& source, bool accumulate) { + return type().put_(*this, index, source, accumulate); +} +inline Tensor & Tensor::index_add_(int64_t dim, const Tensor & index, const Tensor & source) { + return type().index_add_(*this, dim, index, source); +} +inline Tensor & Tensor::index_fill_(int64_t dim, const Tensor & index, Scalar value) { + return type().index_fill_(*this, dim, index, value); +} +inline Tensor & Tensor::index_fill_(int64_t dim, const Tensor & index, const Tensor & value) { + return type().index_fill_(*this, dim, index, value); +} +inline Tensor Tensor::unfold(int64_t dimension, int64_t size, int64_t step) const { + return type().unfold(*this, dimension, size, step); +} +inline Tensor & Tensor::scatter_(int64_t dim, const Tensor & index, const Tensor & src) { + return type().scatter_(*this, dim, index, src); +} +inline Tensor & Tensor::scatter_(int64_t dim, const Tensor & index, Scalar value) { + return type().scatter_(*this, dim, index, value); +} +inline Tensor & Tensor::scatter_add_(int64_t dim, const Tensor & index, const Tensor & src) { + return type().scatter_add_(*this, dim, index, src); +} +inline Tensor Tensor::gather(int64_t dim, const Tensor & index) const { + return type().gather(*this, dim, index); +} +inline void* Tensor::data_ptr() const { + return type().data_ptr(*this); +} +inline bool Tensor::equal(const Tensor & other) const { + return type().equal(*this, other); +} +inline Tensor Tensor::__and__(Scalar other) const { + return type().__and__(*this, other); +} +inline Tensor Tensor::__and__(const Tensor & other) const { + return type().__and__(*this, other); +} +inline Tensor & Tensor::__iand__(Scalar other) { + return type().__iand__(*this, other); +} +inline Tensor & Tensor::__iand__(const Tensor & other) { + return type().__iand__(*this, other); +} +inline Tensor Tensor::__or__(Scalar other) const { + return type().__or__(*this, other); +} +inline Tensor Tensor::__or__(const Tensor & other) const { + return type().__or__(*this, other); +} +inline Tensor & Tensor::__ior__(Scalar other) { + return type().__ior__(*this, other); +} +inline Tensor & Tensor::__ior__(const Tensor & other) { + return type().__ior__(*this, other); +} +inline Tensor Tensor::__xor__(Scalar other) const { + return type().__xor__(*this, other); +} +inline Tensor Tensor::__xor__(const Tensor & other) const { + return type().__xor__(*this, other); +} +inline Tensor & Tensor::__ixor__(Scalar other) { + return type().__ixor__(*this, other); +} +inline Tensor & Tensor::__ixor__(const Tensor & other) { + return type().__ixor__(*this, other); +} +inline Tensor Tensor::__lshift__(Scalar other) const { + return type().__lshift__(*this, other); +} +inline Tensor Tensor::__lshift__(const Tensor & other) const { + return type().__lshift__(*this, other); +} +inline Tensor & Tensor::__ilshift__(Scalar other) { + return type().__ilshift__(*this, other); +} +inline Tensor & Tensor::__ilshift__(const Tensor & other) { + return type().__ilshift__(*this, other); +} +inline Tensor Tensor::__rshift__(Scalar other) const { + return type().__rshift__(*this, other); +} +inline Tensor Tensor::__rshift__(const Tensor & other) const { + return type().__rshift__(*this, other); +} +inline Tensor & Tensor::__irshift__(Scalar other) { + return type().__irshift__(*this, other); +} +inline Tensor & Tensor::__irshift__(const Tensor & other) { + return type().__irshift__(*this, other); +} +inline Tensor Tensor::lt(Scalar other) const { + return type().lt(*this, other); +} +inline Tensor Tensor::lt(const Tensor & other) const { 
+ return type().lt(*this, other); +} +inline Tensor & Tensor::lt_(Scalar other) { + return type().lt_(*this, other); +} +inline Tensor & Tensor::lt_(const Tensor & other) { + return type().lt_(*this, other); +} +inline Tensor Tensor::gt(Scalar other) const { + return type().gt(*this, other); +} +inline Tensor Tensor::gt(const Tensor & other) const { + return type().gt(*this, other); +} +inline Tensor & Tensor::gt_(Scalar other) { + return type().gt_(*this, other); +} +inline Tensor & Tensor::gt_(const Tensor & other) { + return type().gt_(*this, other); +} +inline Tensor Tensor::le(Scalar other) const { + return type().le(*this, other); +} +inline Tensor Tensor::le(const Tensor & other) const { + return type().le(*this, other); +} +inline Tensor & Tensor::le_(Scalar other) { + return type().le_(*this, other); +} +inline Tensor & Tensor::le_(const Tensor & other) { + return type().le_(*this, other); +} +inline Tensor Tensor::ge(Scalar other) const { + return type().ge(*this, other); +} +inline Tensor Tensor::ge(const Tensor & other) const { + return type().ge(*this, other); +} +inline Tensor & Tensor::ge_(Scalar other) { + return type().ge_(*this, other); +} +inline Tensor & Tensor::ge_(const Tensor & other) { + return type().ge_(*this, other); +} +inline Tensor Tensor::eq(Scalar other) const { + return type().eq(*this, other); +} +inline Tensor Tensor::eq(const Tensor & other) const { + return type().eq(*this, other); +} +inline Tensor & Tensor::eq_(Scalar other) { + return type().eq_(*this, other); +} +inline Tensor & Tensor::eq_(const Tensor & other) { + return type().eq_(*this, other); +} +inline Tensor Tensor::ne(Scalar other) const { + return type().ne(*this, other); +} +inline Tensor Tensor::ne(const Tensor & other) const { + return type().ne(*this, other); +} +inline Tensor & Tensor::ne_(Scalar other) { + return type().ne_(*this, other); +} +inline Tensor & Tensor::ne_(const Tensor & other) { + return type().ne_(*this, other); +} +inline Tensor Tensor::min(const Tensor & other) const { + return type().min(*this, other); +} +inline Tensor Tensor::min() const { + return type().min(*this); +} +inline Tensor Tensor::max(const Tensor & other) const { + return type().max(*this, other); +} +inline Tensor Tensor::max() const { + return type().max(*this); +} +inline Tensor Tensor::median() const { + return type().median(*this); +} +inline std::tuple Tensor::sort(int64_t dim, bool descending) const { + return type().sort(*this, dim, descending); +} +inline std::tuple Tensor::topk(int64_t k, int64_t dim, bool largest, bool sorted) const { + return type().topk(*this, k, dim, largest, sorted); +} +inline Tensor Tensor::all() const { + return type().all(*this); +} +inline Tensor Tensor::any() const { + return type().any(*this); +} +inline Tensor Tensor::lgamma() const { + return type().lgamma(*this); +} +inline Tensor & Tensor::lgamma_() { + return type().lgamma_(*this); +} +inline Tensor Tensor::digamma() const { + return type().digamma(*this); +} +inline Tensor & Tensor::digamma_() { + return type().digamma_(*this); +} +inline Tensor Tensor::polygamma(int64_t n) const { + return type().polygamma(n, *this); +} +inline Tensor & Tensor::polygamma_(int64_t n) { + return type().polygamma_(*this, n); +} +inline Tensor & Tensor::erfinv_() { + return type().erfinv_(*this); +} +inline Tensor Tensor::erfinv() const { + return type().erfinv(*this); +} +inline Tensor & Tensor::frac_() { + return type().frac_(*this); +} +inline Tensor Tensor::frac() const { + return type().frac(*this); +} +inline Tensor 
Tensor::renorm(Scalar p, int64_t dim, Scalar maxnorm) const { + return type().renorm(*this, p, dim, maxnorm); +} +inline Tensor & Tensor::renorm_(Scalar p, int64_t dim, Scalar maxnorm) { + return type().renorm_(*this, p, dim, maxnorm); +} +inline Tensor Tensor::dist(const Tensor & other, Scalar p) const { + return type().dist(*this, other, p); +} +inline Tensor Tensor::reciprocal() const { + return type().reciprocal(*this); +} +inline Tensor & Tensor::reciprocal_() { + return type().reciprocal_(*this); +} +inline Tensor Tensor::neg() const { + return type().neg(*this); +} +inline Tensor & Tensor::neg_() { + return type().neg_(*this); +} +inline Tensor Tensor::atan2(const Tensor & other) const { + return type().atan2(*this, other); +} +inline Tensor & Tensor::atan2_(const Tensor & other) { + return type().atan2_(*this, other); +} +inline Tensor Tensor::pow(const Tensor & exponent) const { + return type().pow(*this, exponent); +} +inline Tensor & Tensor::pow_(Scalar exponent) { + return type().pow_(*this, exponent); +} +inline Tensor & Tensor::pow_(const Tensor & exponent) { + return type().pow_(*this, exponent); +} +inline Tensor Tensor::lerp(const Tensor & end, Scalar weight) const { + return type().lerp(*this, end, weight); +} +inline Tensor & Tensor::lerp_(const Tensor & end, Scalar weight) { + return type().lerp_(*this, end, weight); +} +inline Tensor Tensor::histc(int64_t bins, Scalar min, Scalar max) const { + return type().histc(*this, bins, min, max); +} +inline Tensor Tensor::sign() const { + return type().sign(*this); +} +inline Tensor & Tensor::sign_() { + return type().sign_(*this); +} +inline Tensor Tensor::trace() const { + return type().trace(*this); +} +inline Tensor Tensor::fmod(Scalar other) const { + return type().fmod(*this, other); +} +inline Tensor Tensor::fmod(const Tensor & other) const { + return type().fmod(*this, other); +} +inline Tensor & Tensor::fmod_(Scalar other) { + return type().fmod_(*this, other); +} +inline Tensor & Tensor::fmod_(const Tensor & other) { + return type().fmod_(*this, other); +} +inline Tensor Tensor::remainder(Scalar other) const { + return type().remainder(*this, other); +} +inline Tensor Tensor::remainder(const Tensor & other) const { + return type().remainder(*this, other); +} +inline Tensor & Tensor::remainder_(Scalar other) { + return type().remainder_(*this, other); +} +inline Tensor & Tensor::remainder_(const Tensor & other) { + return type().remainder_(*this, other); +} +inline Tensor Tensor::tril(int64_t diagonal) const { + return type().tril(*this, diagonal); +} +inline Tensor & Tensor::tril_(int64_t diagonal) { + return type().tril_(*this, diagonal); +} +inline Tensor Tensor::triu(int64_t diagonal) const { + return type().triu(*this, diagonal); +} +inline Tensor & Tensor::triu_(int64_t diagonal) { + return type().triu_(*this, diagonal); +} +inline Tensor Tensor::cross(const Tensor & other, int64_t dim) const { + return type().cross(*this, other, dim); +} +inline Tensor Tensor::diag(int64_t diagonal) const { + return type().diag(*this, diagonal); +} +inline Tensor Tensor::addbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const { + return type().addbmm(*this, batch1, batch2, beta, alpha); +} +inline Tensor & Tensor::addbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + return type().addbmm_(*this, batch1, batch2, beta, alpha); +} +inline Tensor Tensor::addcmul(const Tensor & tensor1, const Tensor & tensor2, Scalar value) const { + return type().addcmul(*this, tensor1, 
tensor2, value); +} +inline Tensor & Tensor::addcmul_(const Tensor & tensor1, const Tensor & tensor2, Scalar value) { + return type().addcmul_(*this, tensor1, tensor2, value); +} +inline Tensor Tensor::addcdiv(const Tensor & tensor1, const Tensor & tensor2, Scalar value) const { + return type().addcdiv(*this, tensor1, tensor2, value); +} +inline Tensor & Tensor::addcdiv_(const Tensor & tensor1, const Tensor & tensor2, Scalar value) { + return type().addcdiv_(*this, tensor1, tensor2, value); +} +inline std::tuple Tensor::gels(const Tensor & A) const { + return type().gels(*this, A); +} +inline std::tuple Tensor::trtrs(const Tensor & A, bool upper, bool transpose, bool unitriangular) const { + return type().trtrs(*this, A, upper, transpose, unitriangular); +} +inline std::tuple Tensor::symeig(bool eigenvectors, bool upper) const { + return type().symeig(*this, eigenvectors, upper); +} +inline std::tuple Tensor::eig(bool eigenvectors) const { + return type().eig(*this, eigenvectors); +} +inline std::tuple Tensor::svd(bool some) const { + return type().svd(*this, some); +} +inline Tensor Tensor::potrf(bool upper) const { + return type().potrf(*this, upper); +} +inline Tensor Tensor::potrs(const Tensor & input2, bool upper) const { + return type().potrs(*this, input2, upper); +} +inline Tensor Tensor::potri(bool upper) const { + return type().potri(*this, upper); +} +inline std::tuple Tensor::pstrf(bool upper, Scalar tol) const { + return type().pstrf(*this, upper, tol); +} +inline std::tuple Tensor::qr() const { + return type().qr(*this); +} +inline std::tuple Tensor::geqrf() const { + return type().geqrf(*this); +} +inline Tensor Tensor::orgqr(const Tensor & input2) const { + return type().orgqr(*this, input2); +} +inline Tensor Tensor::ormqr(const Tensor & input2, const Tensor & input3, bool left, bool transpose) const { + return type().ormqr(*this, input2, input3, left, transpose); +} +inline std::tuple Tensor::btrifact(bool pivot) const { + return type().btrifact(*this, pivot); +} +inline std::tuple Tensor::btrifact_with_info(bool pivot) const { + return type().btrifact_with_info(*this, pivot); +} +inline Tensor Tensor::btrisolve(const Tensor & LU_data, const Tensor & LU_pivots) const { + return type().btrisolve(*this, LU_data, LU_pivots); +} +inline Tensor & Tensor::random_(int64_t from, int64_t to, Generator * generator) { + return type().random_(*this, from, to, generator); +} +inline Tensor & Tensor::random_(int64_t to, Generator * generator) { + return type().random_(*this, to, generator); +} +inline Tensor & Tensor::random_(Generator * generator) { + return type().random_(*this, generator); +} +inline Tensor Tensor::multinomial(int64_t num_samples, bool replacement, Generator * generator) const { + return type().multinomial(*this, num_samples, replacement, generator); +} +inline Tensor & Tensor::uniform_(double from, double to, Generator * generator) { + return type().uniform_(*this, from, to, generator); +} +inline Tensor & Tensor::normal_(double mean, double std, Generator * generator) { + return type().normal_(*this, mean, std, generator); +} +inline Tensor & Tensor::cauchy_(double median, double sigma, Generator * generator) { + return type().cauchy_(*this, median, sigma, generator); +} +inline Tensor & Tensor::log_normal_(double mean, double std, Generator * generator) { + return type().log_normal_(*this, mean, std, generator); +} +inline Tensor & Tensor::exponential_(double lambd, Generator * generator) { + return type().exponential_(*this, lambd, generator); +} +inline Tensor 
& Tensor::geometric_(double p, Generator * generator) { + return type().geometric_(*this, p, generator); +} +inline Tensor Tensor::abs() const { + return type().abs(*this); +} +inline Tensor & Tensor::abs_() { + return type().abs_(*this); +} +inline Tensor Tensor::acos() const { + return type().acos(*this); +} +inline Tensor & Tensor::acos_() { + return type().acos_(*this); +} +inline Tensor Tensor::add(const Tensor & other, Scalar alpha) const { + return type().add(*this, other, alpha); +} +inline Tensor & Tensor::add_(const Tensor & other, Scalar alpha) { + return type().add_(*this, other, alpha); +} +inline Tensor Tensor::add(Scalar other, Scalar alpha) const { + return type().add(*this, other, alpha); +} +inline Tensor & Tensor::add_(Scalar other, Scalar alpha) { + return type().add_(*this, other, alpha); +} +inline Tensor Tensor::addmv(const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) const { + return type().addmv(*this, mat, vec, beta, alpha); +} +inline Tensor & Tensor::addmv_(const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) { + return type().addmv_(*this, mat, vec, beta, alpha); +} +inline Tensor Tensor::addr(const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) const { + return type().addr(*this, vec1, vec2, beta, alpha); +} +inline Tensor & Tensor::addr_(const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { + return type().addr_(*this, vec1, vec2, beta, alpha); +} +inline Tensor Tensor::all(int64_t dim, bool keepdim) const { + return type().all(*this, dim, keepdim); +} +inline bool Tensor::allclose(const Tensor & other, double rtol, double atol, bool equal_nan) const { + return type().allclose(*this, other, rtol, atol, equal_nan); +} +inline Tensor Tensor::any(int64_t dim, bool keepdim) const { + return type().any(*this, dim, keepdim); +} +inline Tensor Tensor::argmax(int64_t dim, bool keepdim) const { + return type().argmax(*this, dim, keepdim); +} +inline Tensor Tensor::argmax() const { + return type().argmax(*this); +} +inline Tensor Tensor::argmin(int64_t dim, bool keepdim) const { + return type().argmin(*this, dim, keepdim); +} +inline Tensor Tensor::argmin() const { + return type().argmin(*this); +} +inline Tensor Tensor::as_strided(IntList size, IntList stride) const { + return type().as_strided(*this, size, stride); +} +inline Tensor & Tensor::as_strided_(IntList size, IntList stride) { + return type().as_strided_(*this, size, stride); +} +inline Tensor Tensor::as_strided(IntList size, IntList stride, int64_t storage_offset) const { + return type().as_strided(*this, size, stride, storage_offset); +} +inline Tensor & Tensor::as_strided_(IntList size, IntList stride, int64_t storage_offset) { + return type().as_strided_(*this, size, stride, storage_offset); +} +inline Tensor Tensor::asin() const { + return type().asin(*this); +} +inline Tensor & Tensor::asin_() { + return type().asin_(*this); +} +inline Tensor Tensor::atan() const { + return type().atan(*this); +} +inline Tensor & Tensor::atan_() { + return type().atan_(*this); +} +inline Tensor Tensor::baddbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const { + return type().baddbmm(*this, batch1, batch2, beta, alpha); +} +inline Tensor & Tensor::baddbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) { + return type().baddbmm_(*this, batch1, batch2, beta, alpha); +} +inline Tensor Tensor::bernoulli(const Tensor & p, Generator * generator) const { + return type().bernoulli(*this, p, generator); +} 
+inline Tensor Tensor::bernoulli(double p, Generator * generator) const { + return type().bernoulli(*this, p, generator); +} +inline Tensor Tensor::bernoulli() const { + return type().bernoulli(*this); +} +inline Tensor & Tensor::bernoulli_(const Tensor & p, Generator * generator) { + return type().bernoulli_(*this, p, generator); +} +inline Tensor & Tensor::bernoulli_(double p, Generator * generator) { + return type().bernoulli_(*this, p, generator); +} +inline Tensor & Tensor::bernoulli_() { + return type().bernoulli_(*this); +} +inline Tensor Tensor::bincount(const Tensor & weights, int64_t minlength) const { + return type().bincount(*this, weights, minlength); +} +inline Tensor Tensor::bmm(const Tensor & mat2) const { + return type().bmm(*this, mat2); +} +inline Tensor Tensor::ceil() const { + return type().ceil(*this); +} +inline Tensor & Tensor::ceil_() { + return type().ceil_(*this); +} +inline std::vector Tensor::chunk(int64_t chunks, int64_t dim) const { + return type().chunk(*this, chunks, dim); +} +inline Tensor Tensor::clamp(Scalar min, Scalar max) const { + return type().clamp(*this, min, max); +} +inline Tensor & Tensor::clamp_(Scalar min, Scalar max) { + return type().clamp_(*this, min, max); +} +inline Tensor Tensor::clamp_max(Scalar max) const { + return type().clamp_max(*this, max); +} +inline Tensor & Tensor::clamp_max_(Scalar max) { + return type().clamp_max_(*this, max); +} +inline Tensor Tensor::clamp_min(Scalar min) const { + return type().clamp_min(*this, min); +} +inline Tensor & Tensor::clamp_min_(Scalar min) { + return type().clamp_min_(*this, min); +} +inline Tensor Tensor::cos() const { + return type().cos(*this); +} +inline Tensor & Tensor::cos_() { + return type().cos_(*this); +} +inline Tensor Tensor::cosh() const { + return type().cosh(*this); +} +inline Tensor & Tensor::cosh_() { + return type().cosh_(*this); +} +inline Tensor Tensor::cumsum(int64_t dim, ScalarType dtype) const { + return type().cumsum(*this, dim, dtype); +} +inline Tensor Tensor::cumsum(int64_t dim) const { + return type().cumsum(*this, dim); +} +inline Tensor Tensor::cumprod(int64_t dim, ScalarType dtype) const { + return type().cumprod(*this, dim, dtype); +} +inline Tensor Tensor::cumprod(int64_t dim) const { + return type().cumprod(*this, dim); +} +inline Tensor Tensor::det() const { + return type().det(*this); +} +inline Tensor Tensor::diagflat(int64_t offset) const { + return type().diagflat(*this, offset); +} +inline Tensor Tensor::diagonal(int64_t offset, int64_t dim1, int64_t dim2) const { + return type().diagonal(*this, offset, dim1, dim2); +} +inline Tensor Tensor::div(const Tensor & other) const { + return type().div(*this, other); +} +inline Tensor & Tensor::div_(const Tensor & other) { + return type().div_(*this, other); +} +inline Tensor Tensor::div(Scalar other) const { + return type().div(*this, other); +} +inline Tensor & Tensor::div_(Scalar other) { + return type().div_(*this, other); +} +inline Tensor Tensor::dot(const Tensor & tensor) const { + return type().dot(*this, tensor); +} +inline Tensor Tensor::erf() const { + return type().erf(*this); +} +inline Tensor & Tensor::erf_() { + return type().erf_(*this); +} +inline Tensor Tensor::erfc() const { + return type().erfc(*this); +} +inline Tensor & Tensor::erfc_() { + return type().erfc_(*this); +} +inline Tensor Tensor::exp() const { + return type().exp(*this); +} +inline Tensor & Tensor::exp_() { + return type().exp_(*this); +} +inline Tensor Tensor::expm1() const { + return type().expm1(*this); +} +inline Tensor & 
Tensor::expm1_() { + return type().expm1_(*this); +} +inline Tensor Tensor::expand(IntList size, bool implicit) const { + return type().expand(*this, size, implicit); +} +inline Tensor Tensor::expand_as(const Tensor & other) const { + return type().expand_as(*this, other); +} +inline Tensor Tensor::flatten(int64_t start_dim, int64_t end_dim) const { + return type().flatten(*this, start_dim, end_dim); +} +inline Tensor & Tensor::fill_(Scalar value) { + return type().fill_(*this, value); +} +inline Tensor & Tensor::fill_(const Tensor & value) { + return type().fill_(*this, value); +} +inline Tensor Tensor::floor() const { + return type().floor(*this); +} +inline Tensor & Tensor::floor_() { + return type().floor_(*this); +} +inline Tensor Tensor::ger(const Tensor & vec2) const { + return type().ger(*this, vec2); +} +inline std::tuple Tensor::gesv(const Tensor & A) const { + return type().gesv(*this, A); +} +inline Tensor Tensor::fft(int64_t signal_ndim, bool normalized) const { + return type().fft(*this, signal_ndim, normalized); +} +inline Tensor Tensor::ifft(int64_t signal_ndim, bool normalized) const { + return type().ifft(*this, signal_ndim, normalized); +} +inline Tensor Tensor::rfft(int64_t signal_ndim, bool normalized, bool onesided) const { + return type().rfft(*this, signal_ndim, normalized, onesided); +} +inline Tensor Tensor::irfft(int64_t signal_ndim, bool normalized, bool onesided, IntList signal_sizes) const { + return type().irfft(*this, signal_ndim, normalized, onesided, signal_sizes); +} +inline Tensor Tensor::index(TensorList indices) const { + return type().index(*this, indices); +} +inline Tensor & Tensor::index_copy_(int64_t dim, const Tensor & index, const Tensor & source) { + return type().index_copy_(*this, dim, index, source); +} +inline Tensor Tensor::index_put(TensorList indices, const Tensor & values) const { + return type().index_put(*this, indices, values); +} +inline Tensor & Tensor::index_put_(TensorList indices, const Tensor & values) { + return type().index_put_(*this, indices, values); +} +inline Tensor Tensor::inverse() const { + return type().inverse(*this); +} +inline Tensor Tensor::isclose(const Tensor & other, double rtol, double atol, bool equal_nan) const { + return type().isclose(*this, other, rtol, atol, equal_nan); +} +inline bool Tensor::is_cuda() const { + return type().is_cuda(*this); +} +inline bool Tensor::is_distributed() const { + return type().is_distributed(*this); +} +inline bool Tensor::is_floating_point() const { + return type().is_floating_point(*this); +} +inline bool Tensor::is_complex() const { + return type().is_complex(*this); +} +inline bool Tensor::is_nonzero() const { + return type().is_nonzero(*this); +} +inline bool Tensor::is_same_size(const Tensor & other) const { + return type().is_same_size(*this, other); +} +inline bool Tensor::is_signed() const { + return type().is_signed(*this); +} +inline bool Tensor::is_sparse() const { + return type().is_sparse(*this); +} +inline std::tuple Tensor::kthvalue(int64_t k, int64_t dim, bool keepdim) const { + return type().kthvalue(*this, k, dim, keepdim); +} +inline Tensor Tensor::log() const { + return type().log(*this); +} +inline Tensor & Tensor::log_() { + return type().log_(*this); +} +inline Tensor Tensor::log10() const { + return type().log10(*this); +} +inline Tensor & Tensor::log10_() { + return type().log10_(*this); +} +inline Tensor Tensor::log1p() const { + return type().log1p(*this); +} +inline Tensor & Tensor::log1p_() { + return type().log1p_(*this); +} +inline Tensor 
Tensor::log2() const { + return type().log2(*this); +} +inline Tensor & Tensor::log2_() { + return type().log2_(*this); +} +inline Tensor Tensor::logdet() const { + return type().logdet(*this); +} +inline Tensor Tensor::log_softmax(int64_t dim) const { + return type().log_softmax(*this, dim); +} +inline Tensor Tensor::logsumexp(int64_t dim, bool keepdim) const { + return type().logsumexp(*this, dim, keepdim); +} +inline Tensor Tensor::matmul(const Tensor & other) const { + return type().matmul(*this, other); +} +inline Tensor Tensor::matrix_power(int64_t n) const { + return type().matrix_power(*this, n); +} +inline std::tuple Tensor::max(int64_t dim, bool keepdim) const { + return type().max(*this, dim, keepdim); +} +inline Tensor Tensor::max_values(int64_t dim, bool keepdim) const { + return type().max_values(*this, dim, keepdim); +} +inline Tensor Tensor::mean(ScalarType dtype) const { + return type().mean(*this, dtype); +} +inline Tensor Tensor::mean() const { + return type().mean(*this); +} +inline Tensor Tensor::mean(int64_t dim, bool keepdim, ScalarType dtype) const { + return type().mean(*this, dim, keepdim, dtype); +} +inline Tensor Tensor::mean(int64_t dim, bool keepdim) const { + return type().mean(*this, dim, keepdim); +} +inline Tensor Tensor::mean(int64_t dim, ScalarType dtype) const { + return type().mean(*this, dim, dtype); +} +inline std::tuple Tensor::median(int64_t dim, bool keepdim) const { + return type().median(*this, dim, keepdim); +} +inline std::tuple Tensor::min(int64_t dim, bool keepdim) const { + return type().min(*this, dim, keepdim); +} +inline Tensor Tensor::min_values(int64_t dim, bool keepdim) const { + return type().min_values(*this, dim, keepdim); +} +inline Tensor Tensor::mm(const Tensor & mat2) const { + return type().mm(*this, mat2); +} +inline std::tuple Tensor::mode(int64_t dim, bool keepdim) const { + return type().mode(*this, dim, keepdim); +} +inline Tensor Tensor::mul(const Tensor & other) const { + return type().mul(*this, other); +} +inline Tensor & Tensor::mul_(const Tensor & other) { + return type().mul_(*this, other); +} +inline Tensor Tensor::mul(Scalar other) const { + return type().mul(*this, other); +} +inline Tensor & Tensor::mul_(Scalar other) { + return type().mul_(*this, other); +} +inline Tensor Tensor::mv(const Tensor & vec) const { + return type().mv(*this, vec); +} +inline Tensor Tensor::mvlgamma(int64_t p) const { + return type().mvlgamma(*this, p); +} +inline Tensor & Tensor::mvlgamma_(int64_t p) { + return type().mvlgamma_(*this, p); +} +inline Tensor Tensor::narrow(int64_t dim, int64_t start, int64_t length) const { + return type().narrow(*this, dim, start, length); +} +inline Tensor Tensor::permute(IntList dims) const { + return type().permute(*this, dims); +} +inline Tensor Tensor::pin_memory() const { + return type().pin_memory(*this); +} +inline Tensor Tensor::pinverse(double rcond) const { + return type().pinverse(*this, rcond); +} +inline Tensor Tensor::repeat(IntList repeats) const { + return type().repeat(*this, repeats); +} +inline Tensor Tensor::reshape(IntList shape) const { + return type().reshape(*this, shape); +} +inline Tensor Tensor::reshape_as(const Tensor & other) const { + return type().reshape_as(*this, other); +} +inline Tensor Tensor::round() const { + return type().round(*this); +} +inline Tensor & Tensor::round_() { + return type().round_(*this); +} +inline Tensor Tensor::relu() const { + return type().relu(*this); +} +inline Tensor & Tensor::relu_() { + return type().relu_(*this); +} +inline Tensor 
Tensor::hardshrink(Scalar lambd) const { + return type().hardshrink(*this, lambd); +} +inline Tensor Tensor::hardshrink_backward(const Tensor & grad_out, Scalar lambd) const { + return type().hardshrink_backward(grad_out, *this, lambd); +} +inline Tensor Tensor::rsqrt() const { + return type().rsqrt(*this); +} +inline Tensor & Tensor::rsqrt_() { + return type().rsqrt_(*this); +} +inline Tensor Tensor::select(int64_t dim, int64_t index) const { + return type().select(*this, dim, index); +} +inline Tensor Tensor::sigmoid() const { + return type().sigmoid(*this); +} +inline Tensor & Tensor::sigmoid_() { + return type().sigmoid_(*this); +} +inline Tensor Tensor::sin() const { + return type().sin(*this); +} +inline Tensor & Tensor::sin_() { + return type().sin_(*this); +} +inline Tensor Tensor::sinh() const { + return type().sinh(*this); +} +inline Tensor & Tensor::sinh_() { + return type().sinh_(*this); +} +inline Tensor Tensor::detach() const { + return type().detach(*this); +} +inline Tensor & Tensor::detach_() { + return type().detach_(*this); +} +inline int64_t Tensor::size(int64_t dim) const { + return type().size(*this, dim); +} +inline Tensor Tensor::slice(int64_t dim, int64_t start, int64_t end, int64_t step) const { + return type().slice(*this, dim, start, end, step); +} +inline std::tuple Tensor::slogdet() const { + return type().slogdet(*this); +} +inline Tensor Tensor::smm(const Tensor & mat2) const { + return type().smm(*this, mat2); +} +inline Tensor Tensor::softmax(int64_t dim) const { + return type().softmax(*this, dim); +} +inline std::vector Tensor::split(int64_t split_size, int64_t dim) const { + return type().split(*this, split_size, dim); +} +inline std::vector Tensor::split_with_sizes(IntList split_sizes, int64_t dim) const { + return type().split_with_sizes(*this, split_sizes, dim); +} +inline Tensor Tensor::squeeze() const { + return type().squeeze(*this); +} +inline Tensor Tensor::squeeze(int64_t dim) const { + return type().squeeze(*this, dim); +} +inline Tensor & Tensor::squeeze_() { + return type().squeeze_(*this); +} +inline Tensor & Tensor::squeeze_(int64_t dim) { + return type().squeeze_(*this, dim); +} +inline Tensor Tensor::sspaddmm(const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const { + return type().sspaddmm(*this, mat1, mat2, beta, alpha); +} +inline Tensor Tensor::stft(int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor & window, bool normalized, bool onesided) const { + return type().stft(*this, n_fft, hop_length, win_length, window, normalized, onesided); +} +inline int64_t Tensor::stride(int64_t dim) const { + return type().stride(*this, dim); +} +inline Tensor Tensor::sum(ScalarType dtype) const { + return type().sum(*this, dtype); +} +inline Tensor Tensor::sum() const { + return type().sum(*this); +} +inline Tensor Tensor::sum(IntList dim, bool keepdim, ScalarType dtype) const { + return type().sum(*this, dim, keepdim, dtype); +} +inline Tensor Tensor::sum(IntList dim, bool keepdim) const { + return type().sum(*this, dim, keepdim); +} +inline Tensor Tensor::sum(IntList dim, ScalarType dtype) const { + return type().sum(*this, dim, dtype); +} +inline Tensor Tensor::sqrt() const { + return type().sqrt(*this); +} +inline Tensor & Tensor::sqrt_() { + return type().sqrt_(*this); +} +inline Tensor Tensor::std(bool unbiased) const { + return type().std(*this, unbiased); +} +inline Tensor Tensor::std(int64_t dim, bool unbiased, bool keepdim) const { + return type().std(*this, dim, unbiased, keepdim); +} +inline Tensor 
Tensor::prod(ScalarType dtype) const { + return type().prod(*this, dtype); +} +inline Tensor Tensor::prod() const { + return type().prod(*this); +} +inline Tensor Tensor::prod(int64_t dim, bool keepdim, ScalarType dtype) const { + return type().prod(*this, dim, keepdim, dtype); +} +inline Tensor Tensor::prod(int64_t dim, bool keepdim) const { + return type().prod(*this, dim, keepdim); +} +inline Tensor Tensor::prod(int64_t dim, ScalarType dtype) const { + return type().prod(*this, dim, dtype); +} +inline Tensor Tensor::t() const { + return type().t(*this); +} +inline Tensor & Tensor::t_() { + return type().t_(*this); +} +inline Tensor Tensor::tan() const { + return type().tan(*this); +} +inline Tensor & Tensor::tan_() { + return type().tan_(*this); +} +inline Tensor Tensor::tanh() const { + return type().tanh(*this); +} +inline Tensor & Tensor::tanh_() { + return type().tanh_(*this); +} +inline Tensor Tensor::transpose(int64_t dim0, int64_t dim1) const { + return type().transpose(*this, dim0, dim1); +} +inline Tensor & Tensor::transpose_(int64_t dim0, int64_t dim1) { + return type().transpose_(*this, dim0, dim1); +} +inline Tensor Tensor::flip(IntList dims) const { + return type().flip(*this, dims); +} +inline Tensor Tensor::rot90(int64_t k, IntList dims) const { + return type().rot90(*this, k, dims); +} +inline Tensor Tensor::trunc() const { + return type().trunc(*this); +} +inline Tensor & Tensor::trunc_() { + return type().trunc_(*this); +} +inline Tensor Tensor::type_as(const Tensor & other) const { + return type().type_as(*this, other); +} +inline Tensor Tensor::unsqueeze(int64_t dim) const { + return type().unsqueeze(*this, dim); +} +inline Tensor & Tensor::unsqueeze_(int64_t dim) { + return type().unsqueeze_(*this, dim); +} +inline Tensor Tensor::var(bool unbiased) const { + return type().var(*this, unbiased); +} +inline Tensor Tensor::var(int64_t dim, bool unbiased, bool keepdim) const { + return type().var(*this, dim, unbiased, keepdim); +} +inline Tensor Tensor::view_as(const Tensor & other) const { + return type().view_as(*this, other); +} +inline Tensor Tensor::where(const Tensor & condition, const Tensor & other) const { + return type().where(condition, *this, other); +} +inline Tensor Tensor::norm(Scalar p) const { + return type().norm(*this, p); +} +inline Tensor Tensor::norm(Scalar p, int64_t dim, bool keepdim) const { + return type().norm(*this, p, dim, keepdim); +} +inline Tensor Tensor::clone() const { + return type().clone(*this); +} +inline Tensor & Tensor::resize_as_(const Tensor & the_template) { + return type().resize_as_(*this, the_template); +} +inline Tensor Tensor::pow(Scalar exponent) const { + return type().pow(*this, exponent); +} +inline Tensor & Tensor::zero_() { + return type().zero_(*this); +} +inline Tensor Tensor::sub(const Tensor & other, Scalar alpha) const { + return type().sub(*this, other, alpha); +} +inline Tensor & Tensor::sub_(const Tensor & other, Scalar alpha) { + return type().sub_(*this, other, alpha); +} +inline Tensor Tensor::sub(Scalar other, Scalar alpha) const { + return type().sub(*this, other, alpha); +} +inline Tensor & Tensor::sub_(Scalar other, Scalar alpha) { + return type().sub_(*this, other, alpha); +} +inline Tensor Tensor::addmm(const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const { + return type().addmm(*this, mat1, mat2, beta, alpha); +} +inline Tensor & Tensor::addmm_(const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) { + return type().addmm_(*this, mat1, mat2, beta, alpha); +} 
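Every inline body in this header follows the same shape: the `Tensor` method is a thin wrapper that forwards to the virtual method on its `Type`, which is where per-backend dispatch happens. A small sketch (the function name `mm_two_ways` is illustrative, not part of this patch) showing that the wrapper and the explicit `Type` call are the same operation:

#include <ATen/ATen.h>

// Both calls below reach the same backend implementation; the first is the
// inline wrapper defined in this header, the second is what it forwards to.
at::Tensor mm_two_ways(const at::Tensor& self, const at::Tensor& mat2) {
  at::Tensor a = self.mm(mat2);               // Tensor::mm -> type().mm(...)
  at::Tensor b = self.type().mm(self, mat2);  // explicit dynamic dispatch via Type
  return a.sub(b);                            // expected to be all zeros
}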
+inline Tensor & Tensor::sparse_resize_(IntList size, int64_t sparseDims, int64_t denseDims) { + return type().sparse_resize_(*this, size, sparseDims, denseDims); +} +inline Tensor & Tensor::sparse_resize_and_clear_(IntList size, int64_t sparseDims, int64_t denseDims) { + return type().sparse_resize_and_clear_(*this, size, sparseDims, denseDims); +} +inline Tensor Tensor::sparse_mask(SparseTensorRef mask) const { + return type().sparse_mask(*this, mask); +} +inline Tensor Tensor::to_dense() const { + return type().to_dense(*this); +} +inline int64_t Tensor::_sparseDims() const { + return type()._sparseDims(*this); +} +inline int64_t Tensor::_denseDims() const { + return type()._denseDims(*this); +} +inline int64_t Tensor::_nnz() const { + return type()._nnz(*this); +} +inline Tensor Tensor::coalesce() const { + return type().coalesce(*this); +} +inline bool Tensor::is_coalesced() const { + return type().is_coalesced(*this); +} +inline Tensor Tensor::_indices() const { + return type()._indices(*this); +} +inline Tensor Tensor::_values() const { + return type()._values(*this); +} +inline int64_t Tensor::numel() const { + return type().numel(*this); +} +inline std::vector Tensor::unbind(int64_t dim) const { + return type().unbind(*this, dim); +} +inline int64_t Tensor::get_device() const { + return type().get_device(*this); +} +inline Tensor Tensor::to(Device device, ScalarType dtype, bool non_blocking) const { + return type().to(*this, device, dtype, non_blocking); +} +inline Tensor Tensor::to(ScalarType dtype, bool non_blocking) const { + return type().to(*this, dtype, non_blocking); +} +inline Tensor Tensor::to(Device device, bool non_blocking) const { + return type().to(*this, device, non_blocking); +} +inline Tensor Tensor::to(const Tensor & other, bool non_blocking) const { + return type().to(*this, other, non_blocking); +} +inline Scalar Tensor::_local_scalar() const { + return type()._local_scalar(*this); +} + +inline bool Tensor::is_variable() const noexcept { + return type().is_variable(); +} + +inline ScalarType Tensor::dtype() const noexcept { + return type().scalarType(); +} + +inline Layout Tensor::layout() const noexcept { + return type().layout(); +} + +inline Device Tensor::device() const { + return Device(type().device_type(), type().is_cuda() ? 
get_device() : -1); +} + +#define DEFINE_CAST(T, name, _) \ + template <> \ + inline T* Tensor::data() const { \ + AT_CHECK( \ + type().scalarType() == ScalarType::name, \ + "expected scalar type ", \ + #name, \ + " but found ", \ + at::toString(type().scalarType())); \ + return static_cast(this->data_ptr()); \ + } \ + inline T* Tensor::to##name##Data() const { \ + return data(); \ + } + +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_CAST) +#undef DEFINE_CAST + +#define DEFINE_TO_C_TYPE(T,name,_) \ +inline T Tensor::toC##name () const { return _local_scalar().to##name (); } + +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_TO_C_TYPE) +#undef DEFINE_TO_C_TYPE + +} //namespace at diff --git a/aten/src/ATen/core/Type.h b/aten/src/ATen/core/Type.h new file mode 100644 index 00000000000000..1366f899c30b84 --- /dev/null +++ b/aten/src/ATen/core/Type.h @@ -0,0 +1,634 @@ +#pragma once + +#include "ATen/core/ATenGeneral.h" +#include "ATen/core/Allocator.h" +#include "ATen/core/Deprecated.h" +#include "ATen/core/Generator.h" +#include "ATen/core/Layout.h" +#include "ATen/core/Scalar.h" +#include "ATen/core/ScalarType.h" +#include "ATen/core/SparseTensorRef.h" +#include "ATen/core/ArrayRef.h" +#include "ATen/core/Half.h" +#include "ATen/core/TensorTypeIdRegistration.h" +#include "ATen/core/Reduction.h" +#include "ATen/core/TensorOptions.h" + +#include +#include +#include +#include +#include + +// To solve the conflict of s_addr in inaddr.h +#ifdef _MSC_VER +#ifdef s_addr +#undef s_addr +#endif +#endif + +namespace at { + +class Context; +struct Allocator; +struct Generator; +struct Storage; +struct Tensor; + +static inline void noop_deleter(void*) {} + +enum class TypeID { + CPUByte, + CPUChar, + CPUDouble, + CPUFloat, + CPUInt, + CPULong, + CPUShort, + CPUHalf, + SparseCPUByte, + SparseCPUChar, + SparseCPUDouble, + SparseCPUFloat, + SparseCPUInt, + SparseCPULong, + SparseCPUShort, + CUDAByte, + CUDAChar, + CUDADouble, + CUDAFloat, + CUDAInt, + CUDALong, + CUDAShort, + CUDAHalf, + SparseCUDAByte, + SparseCUDAChar, + SparseCUDADouble, + SparseCUDAFloat, + SparseCUDAInt, + SparseCUDALong, + SparseCUDAShort, + CPUComplexFloat, + CPUComplexDouble, + CUDAComplexFloat, + CUDAComplexDouble, + Undefined, + NumOptions +}; + +struct AT_API Type { + explicit Type(TensorTypeId type_id, bool is_variable, bool is_undefined) + : type_id_(type_id), is_variable_(is_variable), is_undefined_(is_undefined) {} + + virtual ~Type() {} + virtual ScalarType scalarType() const = 0; + virtual caffe2::TypeMeta typeMeta() const = 0; + virtual Backend backend() const = 0; + Layout layout() const noexcept { return layout_from_backend(backend()); } + virtual bool is_cuda() const = 0; + virtual bool is_sparse() const = 0; + virtual bool is_distributed() const = 0; + bool is_variable() const noexcept { return is_variable_; } + bool is_undefined() const noexcept { return is_undefined_; } + virtual Allocator * allocator() const = 0; + virtual Device getDeviceFromPtr(void * data) const = 0; + virtual Storage storage(bool resizable = false) const = 0; + virtual Storage storage(size_t size, bool resizable = false) const = 0; + virtual Storage storageFromBlob(void * data, int64_t size, const std::function & deleter=noop_deleter) const = 0; + virtual Storage storageWithAllocator(int64_t size, Allocator* allocator) const = 0; + virtual std::unique_ptr generator() const = 0; + virtual Tensor unsafeTensorFromTH(void * th_pointer, bool retain) const = 0; + virtual Storage unsafeStorageFromTH(void * th_pointer, 
+  virtual const char * toString() const = 0;
+  virtual size_t elementSizeInBytes() const = 0;
+  virtual Type & toBackend(Backend b) const = 0;
+  virtual Type & toScalarType(ScalarType s) const = 0;
+  Type & toSparse() const {
+    return this->toBackend(at::toSparse(this->backend()));
+  }
+  Type & toDense() const {
+    return this->toBackend(at::toDense(this->backend()));
+  }
+  Type & cpu() const {
+    return this->toBackend(at::backendToCPU(this->backend()));
+  }
+  Type & cuda() const {
+    return this->toBackend(at::backendToCUDA(this->backend()));
+  }
+  // contiguous IDs for all types in the system
+  // for external dispatch
+  virtual TypeID ID() const = 0;
+
+  // New-style TensorTypeId that supports open registration.
+  TensorTypeId type_id() const { return type_id_; }
+
+  // NB: This will return DeviceType::CPU for Backend::SparseCPU
+  DeviceType device_type() const {
+    return backendToDeviceType(backend());
+  }
+
+  virtual Tensor copy(const Tensor & src, bool non_blocking=false, optional<Device> to_device={}) const = 0;
+  virtual Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const = 0;
+  virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const = 0;
+  virtual Tensor & _s_copy_from(const Tensor & self, Tensor & dst, bool non_blocking) const = 0;
+
+  virtual void backward(Tensor & self, at::optional<Tensor> gradient, bool keep_graph, bool create_graph) const = 0;
+  virtual void set_data(Tensor & self, Tensor new_data) const = 0;
+
+  virtual Tensor tensorFromBlob(void * data, IntList sizes, const std::function<void(void*)> & deleter=noop_deleter) const = 0;
+  virtual Tensor tensorFromBlob(void * data, IntList sizes, IntList strides, const std::function<void(void*)> & deleter=noop_deleter) const = 0;
+  virtual Tensor tensorWithAllocator(IntList sizes, Allocator* allocator) const = 0;
+  virtual Tensor tensorWithAllocator(IntList sizes, IntList strides, Allocator* allocator) const = 0;
+  virtual Tensor scalarTensor(Scalar s) const = 0;
+
+  bool operator==(const Type& other) const {
+    return this == &other;
+  }
+  bool operator!=(const Type& other) const {
+    return this != &other;
+  }
+
+  /// Constructs the `TensorOptions` from a type and a `device_index`.
+ TensorOptions options(int32_t device_index = -1) const { + TensorOptions r; + r.dtype(scalarType()); + r.device({backendToDeviceType(backend()), device_index}); + r.layout(layout()); + r.is_variable(is_variable()); + return r; + } + + operator TensorOptions() const { + return options(); + } + + // example + // virtual Tensor * add(Tensor & a, Tensor & b) = 0; + virtual int64_t storage_offset(const Tensor & self) const = 0; + virtual Tensor & resize_(Tensor & self, IntList size) const = 0; + virtual Tensor & set_(Tensor & self, Storage source) const = 0; + virtual Tensor & set_(Tensor & self, Storage source, int64_t storage_offset, IntList size, IntList stride) const = 0; + virtual Tensor & set_(Tensor & self, const Tensor & source) const = 0; + virtual Tensor & set_(Tensor & self) const = 0; + virtual bool is_contiguous(const Tensor & self) const = 0; + virtual bool is_set_to(const Tensor & self, const Tensor & tensor) const = 0; + virtual Tensor & s_masked_fill_(Tensor & self, const Tensor & mask, Scalar value) const = 0; + virtual Tensor & masked_fill_(Tensor & self, const Tensor & mask, Scalar value) const = 0; + virtual Tensor & s_masked_fill_(Tensor & self, const Tensor & mask, const Tensor & value) const = 0; + virtual Tensor & masked_fill_(Tensor & self, const Tensor & mask, const Tensor & value) const = 0; + virtual Tensor & s_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) const = 0; + virtual Tensor & masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) const = 0; + virtual Tensor s_masked_select(const Tensor & self, const Tensor & mask) const = 0; + virtual Tensor masked_select(const Tensor & self, const Tensor & mask) const = 0; + virtual Tensor nonzero(const Tensor & self) const = 0; + virtual Tensor contiguous(const Tensor & self) const = 0; + virtual Tensor view(const Tensor & self, IntList size) const = 0; + virtual Tensor index_select(const Tensor & self, int64_t dim, const Tensor & index) const = 0; + virtual Tensor take(const Tensor & self, const Tensor & index) const = 0; + virtual Tensor & put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate) const = 0; + virtual Tensor & index_add_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) const = 0; + virtual Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) const = 0; + virtual Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & value) const = 0; + virtual Tensor unfold(const Tensor & self, int64_t dimension, int64_t size, int64_t step) const = 0; + virtual Tensor & scatter_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & src) const = 0; + virtual Tensor & scatter_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) const = 0; + virtual Tensor & scatter_add_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & src) const = 0; + virtual Tensor gather(const Tensor & self, int64_t dim, const Tensor & index) const = 0; + virtual void* data_ptr(const Tensor & self) const = 0; + virtual bool equal(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __and__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___and__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __and__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __iand__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___iand__(Tensor & self, const Tensor & other) const = 0; + 
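The `options()` helper and the implicit `operator TensorOptions()` above are how a `Type` hands its dtype, device type and layout over to the `TensorOptions`-based factory functions. A minimal usage sketch, not part of the patch; the `at::empty(IntList, TensorOptions)` overload and the `<ATen/ATen.h>` umbrella header are assumed:

    #include <ATen/ATen.h>

    // Allocate a fresh 2x3 tensor whose scalar type, device type and layout
    // match those of an existing tensor, by going through Type::options().
    at::Tensor empty_like_2x3(const at::Tensor& t) {
      at::TensorOptions opts = t.type().options();  // dtype/device/layout copied from t's Type
      return at::empty({2, 3}, opts);               // factory consumes the options
    }
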
virtual Tensor & __iand__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor __or__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___or__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __or__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ior__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___ior__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ior__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor __xor__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___xor__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __xor__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ixor__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___ixor__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ixor__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor __lshift__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___lshift__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __lshift__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ilshift__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___ilshift__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __ilshift__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor __rshift__(const Tensor & self, Scalar other) const = 0; + virtual Tensor s___rshift__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor __rshift__(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __irshift__(Tensor & self, Scalar other) const = 0; + virtual Tensor & s___irshift__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & __irshift__(Tensor & self, const Tensor & other) const = 0; + virtual Tensor lt(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_lt(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor lt(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & lt_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_lt_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & lt_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor gt(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_gt(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor gt(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & gt_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_gt_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & gt_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor le(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_le(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor le(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & le_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_le_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & le_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor ge(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_ge(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor ge(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & ge_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_ge_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor 
& ge_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor eq(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_eq(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor eq(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & eq_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_eq_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & eq_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor ne(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_ne(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor ne(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & ne_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_ne_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & ne_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor s_min(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor min(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor min(const Tensor & self) const = 0; + virtual Tensor s_max(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor max(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor max(const Tensor & self) const = 0; + virtual Tensor median(const Tensor & self) const = 0; + virtual std::tuple sort(const Tensor & self, int64_t dim, bool descending) const = 0; + virtual std::tuple topk(const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted) const = 0; + virtual Tensor all(const Tensor & self) const = 0; + virtual Tensor any(const Tensor & self) const = 0; + virtual Tensor lgamma(const Tensor & self) const = 0; + virtual Tensor & lgamma_(Tensor & self) const = 0; + virtual Tensor digamma(const Tensor & self) const = 0; + virtual Tensor & digamma_(Tensor & self) const = 0; + virtual Tensor polygamma(int64_t n, const Tensor & self) const = 0; + virtual Tensor & polygamma_(Tensor & self, int64_t n) const = 0; + virtual Tensor & erfinv_(Tensor & self) const = 0; + virtual Tensor erfinv(const Tensor & self) const = 0; + virtual Tensor & frac_(Tensor & self) const = 0; + virtual Tensor frac(const Tensor & self) const = 0; + virtual Tensor renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) const = 0; + virtual Tensor & renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) const = 0; + virtual Tensor s_dist(const Tensor & self, const Tensor & other, Scalar p) const = 0; + virtual Tensor dist(const Tensor & self, const Tensor & other, Scalar p) const = 0; + virtual Tensor reciprocal(const Tensor & self) const = 0; + virtual Tensor & reciprocal_(Tensor & self) const = 0; + virtual Tensor neg(const Tensor & self) const = 0; + virtual Tensor & neg_(Tensor & self) const = 0; + virtual Tensor s_atan2(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor atan2(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & s_atan2_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & atan2_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor s_pow(const Tensor & self, const Tensor & exponent) const = 0; + virtual Tensor pow(const Tensor & self, const Tensor & exponent) const = 0; + virtual Tensor pow(Scalar base, const Tensor & self) const = 0; + virtual Tensor & pow_(Tensor & self, Scalar exponent) const = 0; + virtual Tensor & s_pow_(Tensor & self, const Tensor & exponent) const = 0; + virtual Tensor & pow_(Tensor & self, const Tensor & 
exponent) const = 0; + virtual Tensor s_lerp(const Tensor & self, const Tensor & end, Scalar weight) const = 0; + virtual Tensor lerp(const Tensor & self, const Tensor & end, Scalar weight) const = 0; + virtual Tensor & s_lerp_(Tensor & self, const Tensor & end, Scalar weight) const = 0; + virtual Tensor & lerp_(Tensor & self, const Tensor & end, Scalar weight) const = 0; + virtual Tensor histc(const Tensor & self, int64_t bins, Scalar min, Scalar max) const = 0; + virtual Tensor sign(const Tensor & self) const = 0; + virtual Tensor & sign_(Tensor & self) const = 0; + virtual Tensor trace(const Tensor & self) const = 0; + virtual Tensor fmod(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_fmod(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor fmod(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & fmod_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_fmod_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & fmod_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor remainder(const Tensor & self, Scalar other) const = 0; + virtual Tensor s_remainder(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor remainder(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & remainder_(Tensor & self, Scalar other) const = 0; + virtual Tensor & s_remainder_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor & remainder_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor tril(const Tensor & self, int64_t diagonal) const = 0; + virtual Tensor & tril_(Tensor & self, int64_t diagonal) const = 0; + virtual Tensor triu(const Tensor & self, int64_t diagonal) const = 0; + virtual Tensor & triu_(Tensor & self, int64_t diagonal) const = 0; + virtual Tensor cross(const Tensor & self, const Tensor & other, int64_t dim) const = 0; + virtual Tensor diag(const Tensor & self, int64_t diagonal) const = 0; + virtual Tensor s_addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & addbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor s_addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor & s_addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor & addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor s_addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor & s_addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual Tensor & addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, Scalar value) const = 0; + virtual std::tuple gels(const Tensor & self, const Tensor & A) const = 0; + virtual std::tuple trtrs(const Tensor & self, const Tensor & A, bool upper, bool transpose, bool unitriangular) const = 0; + virtual std::tuple symeig(const Tensor & 
self, bool eigenvectors, bool upper) const = 0; + virtual std::tuple eig(const Tensor & self, bool eigenvectors) const = 0; + virtual std::tuple svd(const Tensor & self, bool some) const = 0; + virtual Tensor potrf(const Tensor & self, bool upper) const = 0; + virtual Tensor potrs(const Tensor & self, const Tensor & input2, bool upper) const = 0; + virtual Tensor potri(const Tensor & self, bool upper) const = 0; + virtual std::tuple pstrf(const Tensor & self, bool upper, Scalar tol) const = 0; + virtual std::tuple qr(const Tensor & self) const = 0; + virtual std::tuple geqrf(const Tensor & self) const = 0; + virtual Tensor orgqr(const Tensor & self, const Tensor & input2) const = 0; + virtual Tensor ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose) const = 0; + virtual std::tuple btrifact(const Tensor & self, bool pivot) const = 0; + virtual std::tuple btrifact_with_info(const Tensor & self, bool pivot) const = 0; + virtual Tensor btrisolve(const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots) const = 0; + virtual Tensor & random_(Tensor & self, int64_t from, int64_t to, Generator * generator) const = 0; + virtual Tensor & random_(Tensor & self, int64_t to, Generator * generator) const = 0; + virtual Tensor & random_(Tensor & self, Generator * generator) const = 0; + virtual Tensor multinomial(const Tensor & self, int64_t num_samples, bool replacement, Generator * generator) const = 0; + virtual Tensor & uniform_(Tensor & self, double from, double to, Generator * generator) const = 0; + virtual Tensor & normal_(Tensor & self, double mean, double std, Generator * generator) const = 0; + virtual Tensor & cauchy_(Tensor & self, double median, double sigma, Generator * generator) const = 0; + virtual Tensor & log_normal_(Tensor & self, double mean, double std, Generator * generator) const = 0; + virtual Tensor & exponential_(Tensor & self, double lambd, Generator * generator) const = 0; + virtual Tensor & geometric_(Tensor & self, double p, Generator * generator) const = 0; + virtual Tensor tensor(Storage storage, int64_t storageOffset, IntList size, IntList stride) const = 0; + virtual Tensor tensor(IntList size, IntList stride) const = 0; + virtual Tensor abs(const Tensor & self) const = 0; + virtual Tensor & abs_(Tensor & self) const = 0; + virtual Tensor acos(const Tensor & self) const = 0; + virtual Tensor & acos_(Tensor & self) const = 0; + virtual Tensor add(const Tensor & self, const Tensor & other, Scalar alpha) const = 0; + virtual Tensor & add_(Tensor & self, const Tensor & other, Scalar alpha) const = 0; + virtual Tensor add(const Tensor & self, Scalar other, Scalar alpha) const = 0; + virtual Tensor & add_(Tensor & self, Scalar other, Scalar alpha) const = 0; + virtual Tensor addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & addmv_(Tensor & self, const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) const = 0; + virtual Tensor addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor all(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual bool allclose(const Tensor & self, const Tensor & other, double rtol, double atol, bool equal_nan) const = 0; + virtual Tensor any(const Tensor & self, int64_t dim, bool keepdim) const = 0; + 
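Each of these operations is a pure virtual on `Type`, and the inline `Tensor` methods earlier in the patch are one-line forwarders onto them, so dispatch to the CPU, CUDA or sparse implementation happens entirely through this vtable. A hedged sketch of what that looks like when written out by hand; the helper below is hypothetical and is equivalent to calling the corresponding `Tensor` members directly:

    #include <ATen/ATen.h>

    // Explicit dispatch through the Type vtable: clamp to [0, 1], then negate.
    at::Tensor clamp01_then_negate(const at::Tensor& t) {
      at::Tensor c = t.type().clamp(t, /*min=*/0, /*max=*/1);  // virtual Tensor clamp(const Tensor&, Scalar, Scalar)
      return c.type().neg(c);                                  // virtual Tensor neg(const Tensor&)
    }
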
AT_DEPRECATED(virtual Tensor arange(Scalar start, Scalar end, Scalar step) const = 0); + AT_DEPRECATED(virtual Tensor arange(Scalar end) const = 0); + virtual Tensor argmax(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor argmax(const Tensor & self) const = 0; + virtual Tensor argmin(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor argmin(const Tensor & self) const = 0; + virtual Tensor as_strided(const Tensor & self, IntList size, IntList stride) const = 0; + virtual Tensor & as_strided_(Tensor & self, IntList size, IntList stride) const = 0; + virtual Tensor as_strided(const Tensor & self, IntList size, IntList stride, int64_t storage_offset) const = 0; + virtual Tensor & as_strided_(Tensor & self, IntList size, IntList stride, int64_t storage_offset) const = 0; + virtual Tensor asin(const Tensor & self) const = 0; + virtual Tensor & asin_(Tensor & self) const = 0; + virtual Tensor atan(const Tensor & self) const = 0; + virtual Tensor & atan_(Tensor & self) const = 0; + virtual Tensor baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & baddbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor bernoulli(const Tensor & self, const Tensor & p, Generator * generator) const = 0; + virtual Tensor bernoulli(const Tensor & self, double p, Generator * generator) const = 0; + virtual Tensor bernoulli(const Tensor & self) const = 0; + virtual Tensor & bernoulli_(Tensor & self, const Tensor & p, Generator * generator) const = 0; + virtual Tensor & bernoulli_(Tensor & self, double p, Generator * generator) const = 0; + virtual Tensor & bernoulli_(Tensor & self) const = 0; + virtual Tensor bincount(const Tensor & self, const Tensor & weights, int64_t minlength) const = 0; + virtual Tensor bmm(const Tensor & self, const Tensor & mat2) const = 0; + virtual Tensor ceil(const Tensor & self) const = 0; + virtual Tensor & ceil_(Tensor & self) const = 0; + virtual std::vector chunk(const Tensor & self, int64_t chunks, int64_t dim) const = 0; + virtual Tensor clamp(const Tensor & self, Scalar min, Scalar max) const = 0; + virtual Tensor & clamp_(Tensor & self, Scalar min, Scalar max) const = 0; + virtual Tensor clamp_max(const Tensor & self, Scalar max) const = 0; + virtual Tensor & clamp_max_(Tensor & self, Scalar max) const = 0; + virtual Tensor clamp_min(const Tensor & self, Scalar min) const = 0; + virtual Tensor & clamp_min_(Tensor & self, Scalar min) const = 0; + virtual Tensor cos(const Tensor & self) const = 0; + virtual Tensor & cos_(Tensor & self) const = 0; + virtual Tensor cosh(const Tensor & self) const = 0; + virtual Tensor & cosh_(Tensor & self) const = 0; + virtual Tensor cumsum(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; + virtual Tensor cumsum(const Tensor & self, int64_t dim) const = 0; + virtual Tensor cumprod(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; + virtual Tensor cumprod(const Tensor & self, int64_t dim) const = 0; + virtual Tensor det(const Tensor & self) const = 0; + virtual Tensor diagflat(const Tensor & self, int64_t offset) const = 0; + virtual Tensor diagonal(const Tensor & self, int64_t offset, int64_t dim1, int64_t dim2) const = 0; + virtual Tensor div(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & div_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor div(const Tensor & self, Scalar other) const = 
0; + virtual Tensor & div_(Tensor & self, Scalar other) const = 0; + virtual Tensor dot(const Tensor & self, const Tensor & tensor) const = 0; + AT_DEPRECATED(virtual Tensor empty(IntList size) const = 0); + virtual Tensor erf(const Tensor & self) const = 0; + virtual Tensor & erf_(Tensor & self) const = 0; + virtual Tensor erfc(const Tensor & self) const = 0; + virtual Tensor & erfc_(Tensor & self) const = 0; + virtual Tensor exp(const Tensor & self) const = 0; + virtual Tensor & exp_(Tensor & self) const = 0; + virtual Tensor expm1(const Tensor & self) const = 0; + virtual Tensor & expm1_(Tensor & self) const = 0; + virtual Tensor expand(const Tensor & self, IntList size, bool implicit) const = 0; + virtual Tensor expand_as(const Tensor & self, const Tensor & other) const = 0; + AT_DEPRECATED(virtual Tensor eye(int64_t n, int64_t m) const = 0); + virtual Tensor flatten(const Tensor & self, int64_t start_dim, int64_t end_dim) const = 0; + virtual Tensor & fill_(Tensor & self, Scalar value) const = 0; + virtual Tensor & fill_(Tensor & self, const Tensor & value) const = 0; + virtual Tensor floor(const Tensor & self) const = 0; + virtual Tensor & floor_(Tensor & self) const = 0; + AT_DEPRECATED(virtual Tensor full(IntList size, Scalar fill_value) const = 0); + virtual Tensor ger(const Tensor & self, const Tensor & vec2) const = 0; + virtual std::tuple gesv(const Tensor & self, const Tensor & A) const = 0; + virtual Tensor fft(const Tensor & self, int64_t signal_ndim, bool normalized) const = 0; + virtual Tensor ifft(const Tensor & self, int64_t signal_ndim, bool normalized) const = 0; + virtual Tensor rfft(const Tensor & self, int64_t signal_ndim, bool normalized, bool onesided) const = 0; + virtual Tensor irfft(const Tensor & self, int64_t signal_ndim, bool normalized, bool onesided, IntList signal_sizes) const = 0; + virtual Tensor index(const Tensor & self, TensorList indices) const = 0; + virtual Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) const = 0; + virtual Tensor index_put(const Tensor & self, TensorList indices, const Tensor & values) const = 0; + virtual Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & values) const = 0; + virtual Tensor inverse(const Tensor & self) const = 0; + virtual Tensor isclose(const Tensor & self, const Tensor & other, double rtol, double atol, bool equal_nan) const = 0; + virtual bool is_cuda(const Tensor & self) const = 0; + virtual bool is_distributed(const Tensor & self) const = 0; + virtual bool is_floating_point(const Tensor & self) const = 0; + virtual bool is_complex(const Tensor & self) const = 0; + virtual bool is_nonzero(const Tensor & self) const = 0; + virtual bool is_same_size(const Tensor & self, const Tensor & other) const = 0; + virtual bool is_signed(const Tensor & self) const = 0; + virtual bool is_sparse(const Tensor & self) const = 0; + virtual std::tuple kthvalue(const Tensor & self, int64_t k, int64_t dim, bool keepdim) const = 0; + AT_DEPRECATED(virtual Tensor linspace(Scalar start, Scalar end, int64_t steps) const = 0); + virtual Tensor log(const Tensor & self) const = 0; + virtual Tensor & log_(Tensor & self) const = 0; + virtual Tensor log10(const Tensor & self) const = 0; + virtual Tensor & log10_(Tensor & self) const = 0; + virtual Tensor log1p(const Tensor & self) const = 0; + virtual Tensor & log1p_(Tensor & self) const = 0; + virtual Tensor log2(const Tensor & self) const = 0; + virtual Tensor & log2_(Tensor & self) const = 0; + virtual Tensor 
logdet(const Tensor & self) const = 0; + AT_DEPRECATED(virtual Tensor logspace(Scalar start, Scalar end, int64_t steps) const = 0); + virtual Tensor log_softmax(const Tensor & self, int64_t dim) const = 0; + virtual Tensor logsumexp(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor matmul(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor matrix_power(const Tensor & self, int64_t n) const = 0; + virtual std::tuple max(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor max_values(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor mean(const Tensor & self, ScalarType dtype) const = 0; + virtual Tensor mean(const Tensor & self) const = 0; + virtual Tensor mean(const Tensor & self, int64_t dim, bool keepdim, ScalarType dtype) const = 0; + virtual Tensor mean(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor mean(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; + virtual std::tuple median(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual std::tuple min(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor min_values(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor mm(const Tensor & self, const Tensor & mat2) const = 0; + virtual std::tuple mode(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor mul(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor & mul_(Tensor & self, const Tensor & other) const = 0; + virtual Tensor mul(const Tensor & self, Scalar other) const = 0; + virtual Tensor & mul_(Tensor & self, Scalar other) const = 0; + virtual Tensor mv(const Tensor & self, const Tensor & vec) const = 0; + virtual Tensor mvlgamma(const Tensor & self, int64_t p) const = 0; + virtual Tensor & mvlgamma_(Tensor & self, int64_t p) const = 0; + virtual Tensor narrow(const Tensor & self, int64_t dim, int64_t start, int64_t length) const = 0; + AT_DEPRECATED(virtual Tensor ones(IntList size) const = 0); + virtual Tensor permute(const Tensor & self, IntList dims) const = 0; + virtual Tensor pin_memory(const Tensor & self) const = 0; + virtual Tensor pinverse(const Tensor & self, double rcond) const = 0; + AT_DEPRECATED(virtual Tensor rand(IntList size, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor randint(int64_t high, IntList size, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor randint(int64_t low, int64_t high, IntList size, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor randn(IntList size, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor randperm(int64_t n, Generator * generator) const = 0); + AT_DEPRECATED(virtual Tensor range(Scalar start, Scalar end, Scalar step) const = 0); + virtual Tensor repeat(const Tensor & self, IntList repeats) const = 0; + virtual Tensor reshape(const Tensor & self, IntList shape) const = 0; + virtual Tensor reshape_as(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor round(const Tensor & self) const = 0; + virtual Tensor & round_(Tensor & self) const = 0; + virtual Tensor relu(const Tensor & self) const = 0; + virtual Tensor & relu_(Tensor & self) const = 0; + virtual Tensor hardshrink(const Tensor & self, Scalar lambd) const = 0; + virtual Tensor hardshrink_backward(const Tensor & grad_out, const Tensor & self, Scalar lambd) const = 0; + virtual Tensor rsqrt(const Tensor & self) const = 0; + virtual Tensor & rsqrt_(Tensor & self) 
const = 0; + virtual Tensor select(const Tensor & self, int64_t dim, int64_t index) const = 0; + virtual Tensor sigmoid(const Tensor & self) const = 0; + virtual Tensor & sigmoid_(Tensor & self) const = 0; + virtual Tensor sin(const Tensor & self) const = 0; + virtual Tensor & sin_(Tensor & self) const = 0; + virtual Tensor sinh(const Tensor & self) const = 0; + virtual Tensor & sinh_(Tensor & self) const = 0; + virtual Tensor detach(const Tensor & self) const = 0; + virtual Tensor & detach_(Tensor & self) const = 0; + virtual int64_t size(const Tensor & self, int64_t dim) const = 0; + virtual Tensor slice(const Tensor & self, int64_t dim, int64_t start, int64_t end, int64_t step) const = 0; + virtual std::tuple slogdet(const Tensor & self) const = 0; + virtual Tensor smm(const Tensor & self, const Tensor & mat2) const = 0; + virtual Tensor softmax(const Tensor & self, int64_t dim) const = 0; + virtual std::vector split(const Tensor & self, int64_t split_size, int64_t dim) const = 0; + virtual std::vector split_with_sizes(const Tensor & self, IntList split_sizes, int64_t dim) const = 0; + virtual Tensor squeeze(const Tensor & self) const = 0; + virtual Tensor squeeze(const Tensor & self, int64_t dim) const = 0; + virtual Tensor & squeeze_(Tensor & self) const = 0; + virtual Tensor & squeeze_(Tensor & self, int64_t dim) const = 0; + virtual Tensor sspaddmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor stft(const Tensor & self, int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor & window, bool normalized, bool onesided) const = 0; + virtual int64_t stride(const Tensor & self, int64_t dim) const = 0; + virtual Tensor sum(const Tensor & self, ScalarType dtype) const = 0; + virtual Tensor sum(const Tensor & self) const = 0; + virtual Tensor sum(const Tensor & self, IntList dim, bool keepdim, ScalarType dtype) const = 0; + virtual Tensor sum(const Tensor & self, IntList dim, bool keepdim) const = 0; + virtual Tensor sum(const Tensor & self, IntList dim, ScalarType dtype) const = 0; + virtual Tensor sqrt(const Tensor & self) const = 0; + virtual Tensor & sqrt_(Tensor & self) const = 0; + virtual Tensor std(const Tensor & self, bool unbiased) const = 0; + virtual Tensor std(const Tensor & self, int64_t dim, bool unbiased, bool keepdim) const = 0; + virtual Tensor prod(const Tensor & self, ScalarType dtype) const = 0; + virtual Tensor prod(const Tensor & self) const = 0; + virtual Tensor prod(const Tensor & self, int64_t dim, bool keepdim, ScalarType dtype) const = 0; + virtual Tensor prod(const Tensor & self, int64_t dim, bool keepdim) const = 0; + virtual Tensor prod(const Tensor & self, int64_t dim, ScalarType dtype) const = 0; + virtual Tensor t(const Tensor & self) const = 0; + virtual Tensor & t_(Tensor & self) const = 0; + virtual Tensor tan(const Tensor & self) const = 0; + virtual Tensor & tan_(Tensor & self) const = 0; + virtual Tensor tanh(const Tensor & self) const = 0; + virtual Tensor & tanh_(Tensor & self) const = 0; + virtual Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1) const = 0; + virtual Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1) const = 0; + virtual Tensor flip(const Tensor & self, IntList dims) const = 0; + virtual Tensor rot90(const Tensor & self, int64_t k, IntList dims) const = 0; + virtual Tensor trunc(const Tensor & self) const = 0; + virtual Tensor & trunc_(Tensor & self) const = 0; + virtual Tensor type_as(const Tensor & self, const Tensor & 
other) const = 0; + virtual Tensor unsqueeze(const Tensor & self, int64_t dim) const = 0; + virtual Tensor & unsqueeze_(Tensor & self, int64_t dim) const = 0; + virtual Tensor var(const Tensor & self, bool unbiased) const = 0; + virtual Tensor var(const Tensor & self, int64_t dim, bool unbiased, bool keepdim) const = 0; + virtual Tensor view_as(const Tensor & self, const Tensor & other) const = 0; + virtual Tensor where(const Tensor & condition, const Tensor & self, const Tensor & other) const = 0; + AT_DEPRECATED(virtual Tensor zeros(IntList size) const = 0); + virtual Tensor norm(const Tensor & self, Scalar p) const = 0; + virtual Tensor norm(const Tensor & self, Scalar p, int64_t dim, bool keepdim) const = 0; + virtual Tensor clone(const Tensor & self) const = 0; + virtual Tensor & resize_as_(Tensor & self, const Tensor & the_template) const = 0; + virtual Tensor pow(const Tensor & self, Scalar exponent) const = 0; + virtual Tensor & zero_(Tensor & self) const = 0; + virtual Tensor sub(const Tensor & self, const Tensor & other, Scalar alpha) const = 0; + virtual Tensor & sub_(Tensor & self, const Tensor & other, Scalar alpha) const = 0; + virtual Tensor sub(const Tensor & self, Scalar other, Scalar alpha) const = 0; + virtual Tensor & sub_(Tensor & self, Scalar other, Scalar alpha) const = 0; + virtual Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor & addmm_(Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const = 0; + virtual Tensor tensor() const = 0; + virtual Tensor tensor(IntList size) const = 0; + virtual Tensor native_sparse_coo_tensor(IntList size) const = 0; + virtual Tensor native_sparse_coo_tensor(const Tensor & indices, const Tensor & values) const = 0; + virtual Tensor native_sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size) const = 0; + virtual Tensor sparse_coo_tensor(IntList size) const = 0; + virtual Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values) const = 0; + virtual Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntList size) const = 0; + virtual Tensor _native_sparse_coo_tensor_unsafe(const Tensor & indices, const Tensor & values, IntList size) const = 0; + virtual Tensor & sparse_resize_(Tensor & self, IntList size, int64_t sparseDims, int64_t denseDims) const = 0; + virtual Tensor & sparse_resize_and_clear_(Tensor & self, IntList size, int64_t sparseDims, int64_t denseDims) const = 0; + virtual Tensor sparse_mask(const Tensor & self, SparseTensorRef mask) const = 0; + virtual Tensor to_dense(const Tensor & self) const = 0; + virtual int64_t _sparseDims(const Tensor & self) const = 0; + virtual int64_t _denseDims(const Tensor & self) const = 0; + virtual int64_t _nnz(const Tensor & self) const = 0; + virtual Tensor coalesce(const Tensor & self) const = 0; + virtual bool is_coalesced(const Tensor & self) const = 0; + virtual Tensor _indices(const Tensor & self) const = 0; + virtual Tensor _values(const Tensor & self) const = 0; + virtual int64_t numel(const Tensor & self) const = 0; + virtual std::vector unbind(const Tensor & self, int64_t dim) const = 0; + virtual int64_t get_device(const Tensor & self) const = 0; + virtual Tensor to(const Tensor & self, Device device, ScalarType dtype, bool non_blocking) const = 0; + virtual Tensor to(const Tensor & self, ScalarType dtype, bool non_blocking) const = 0; + virtual Tensor to(const Tensor & self, Device device, bool 
non_blocking) const = 0; + virtual Tensor to(const Tensor & self, const Tensor & other, bool non_blocking) const = 0; + virtual Scalar _local_scalar(const Tensor & self) const = 0; +protected: + TensorTypeId type_id_; + bool is_variable_; + bool is_undefined_; + +}; + +} // namespace at + +#include "ATen/core/Tensor.h" diff --git a/aten/src/ATen/core/UndefinedTensorImpl.cpp b/aten/src/ATen/core/UndefinedTensorImpl.cpp new file mode 100644 index 00000000000000..e26b61a03c87e0 --- /dev/null +++ b/aten/src/ATen/core/UndefinedTensorImpl.cpp @@ -0,0 +1,40 @@ +#include "ATen/core/UndefinedTensorImpl.h" +#include "ATen/core/Error.h" + +namespace at { + +// should this use the globalContext? Can it get a context passed in somehow? +UndefinedTensorImpl::UndefinedTensorImpl() +: TensorImpl(UndefinedTensorId(), caffe2::TypeMeta(), nullptr, /* is variable */ false) { +} + +IntList UndefinedTensorImpl::sizes() const { + AT_ERROR("sizes() called on undefined Tensor"); +} + +int64_t UndefinedTensorImpl::size(int64_t d) const { + AT_ERROR("size(dim) called on an undefined Tensor"); +} + +int64_t UndefinedTensorImpl::stride(int64_t d) const { + AT_ERROR("stride(dim) called on an undefined Tensor"); +} + +int64_t UndefinedTensorImpl::dim() const { + AT_ERROR("dim() called on undefined Tensor"); +} + +const Storage& UndefinedTensorImpl::storage() const { + AT_ERROR("storage() called on undefined Tensor"); +} + +int64_t UndefinedTensorImpl::storage_offset() const { + AT_ERROR("storage_offset() called on an undefined Tensor"); +} + +IntList UndefinedTensorImpl::strides() const { + AT_ERROR("strides() called on undefined Tensor"); +} +UndefinedTensorImpl UndefinedTensorImpl::_singleton; + +} diff --git a/aten/src/ATen/UndefinedTensor.h b/aten/src/ATen/core/UndefinedTensorImpl.h similarity index 72% rename from aten/src/ATen/UndefinedTensor.h rename to aten/src/ATen/core/UndefinedTensorImpl.h index 86faf028802c1e..6c734950d90cad 100644 --- a/aten/src/ATen/UndefinedTensor.h +++ b/aten/src/ATen/core/UndefinedTensorImpl.h @@ -1,13 +1,13 @@ #pragma once -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" namespace at { -struct AT_API UndefinedTensor final : public TensorImpl { +struct AT_API UndefinedTensorImpl final : public TensorImpl { public: // Without this, we get: - // error: identifier "at::UndefinedTensor::_singleton" is undefined in device code + // error: identifier "at::UndefinedTensorImpl::_singleton" is undefined in device code // (ostensibly because the constexpr tricks MSVC into trying to compile this // function for device as well). 
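`UndefinedTensorImpl` (renamed here from `UndefinedTensor`) is the shared singleton behind tensors that do not hold a value; as the .cpp above shows, every shape and storage accessor raises `AT_ERROR` instead of returning garbage. A hedged illustration, assuming (as elsewhere in ATen) that a default-constructed `at::Tensor` wraps this singleton and that `AT_ERROR` surfaces as a `std::exception`:

    #include <ATen/ATen.h>
    #include <exception>

    void probe_undefined() {
      at::Tensor t;                 // default-constructed: backed by UndefinedTensorImpl
      bool defined = t.defined();   // false
      try {
        t.sizes();                  // "sizes() called on undefined Tensor"
      } catch (const std::exception&) {
        // expected: querying an undefined tensor throws
      }
      (void)defined;
    }
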
#ifdef _WIN32 @@ -25,8 +25,8 @@ struct AT_API UndefinedTensor final : public TensorImpl { const Storage& storage() const override; int64_t storage_offset() const override; private: - UndefinedTensor(); - static UndefinedTensor _singleton; + UndefinedTensorImpl(); + static UndefinedTensorImpl _singleton; public: friend struct UndefinedType; }; diff --git a/aten/src/ATen/core/context_base.h b/aten/src/ATen/core/context_base.h index 2ca9a7f6851102..7cf1b7cc174980 100644 --- a/aten/src/ATen/core/context_base.h +++ b/aten/src/ATen/core/context_base.h @@ -61,7 +61,7 @@ class AT_CORE_API BaseContext { virtual BaseStaticContext* GetStaticContext() const = 0; /* Sorry for the naming, will get rid of this in future diff */ - virtual DeviceType GetDevicetype() const = 0; + virtual DeviceType device_type() const = 0; virtual void SwitchToDevice(int /*stream_id*/) = 0; @@ -96,13 +96,13 @@ class AT_CORE_API BaseContext { DeviceType type) { if (type == DeviceType::CPU) { CopyBytesToCPU(nbytes, src, dst); - } else if (type == GetDevicetype()) { + } else if (type == device_type()) { CopyBytesSameDevice(nbytes, src, dst); } else { AT_ERROR( "CopyBytesToDevice can only copy to CPU or between same " "device. Can't copy from: ", - GetDevicetype(), + device_type(), " to", type); } diff --git a/aten/src/ATen/core/intrusive_ptr.h b/aten/src/ATen/core/intrusive_ptr.h index 65c6b5e702f2a0..961915555a3756 100644 --- a/aten/src/ATen/core/intrusive_ptr.h +++ b/aten/src/ATen/core/intrusive_ptr.h @@ -119,6 +119,15 @@ struct AT_CORE_EXPORT intrusive_target_default_null_type final { return nullptr; } }; + +template +TTarget* assign_ptr_(TTarget* rhs) { + if (FromNullType::singleton() == rhs) { + return ToNullType::singleton(); + } else { + return rhs; + } +} } // namespace detail template @@ -191,17 +200,10 @@ class AT_CORE_EXPORT intrusive_ptr final { template /* implicit */ intrusive_ptr(intrusive_ptr&& rhs) noexcept - : target_(rhs.target_) { + : target_(detail::assign_ptr_(rhs.target_)) { static_assert( std::is_convertible::value, "Type mismatch. intrusive_ptr move constructor got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. intrusive_ptr move constructor got pointer with differing null value."); -#endif rhs.target_ = FromNullType::singleton(); } @@ -212,17 +214,10 @@ class AT_CORE_EXPORT intrusive_ptr final { template /* implicit */ intrusive_ptr( const intrusive_ptr& rhs) - : target_(rhs.target_) { + : target_(detail::assign_ptr_(rhs.target_)) { static_assert( std::is_convertible::value, "Type mismatch. intrusive_ptr copy constructor got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. intrusive_ptr copy constructor got pointer with differing null value."); -#endif retain_(); } @@ -240,13 +235,6 @@ class AT_CORE_EXPORT intrusive_ptr final { static_assert( std::is_convertible::value, "Type mismatch. intrusive_ptr move assignment got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. 
intrusive_ptr move assignment got pointer with differing null value."); -#endif intrusive_ptr tmp = std::move(rhs); swap(tmp); return *this; @@ -261,13 +249,6 @@ class AT_CORE_EXPORT intrusive_ptr final { static_assert( std::is_convertible::value, "Type mismatch. intrusive_ptr copy assignment got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. intrusive_ptr copy assignment got pointer with differing null value."); -#endif intrusive_ptr tmp = rhs; swap(tmp); return *this; @@ -464,17 +445,10 @@ class AT_CORE_EXPORT weak_intrusive_ptr final { template /* implicit */ weak_intrusive_ptr( weak_intrusive_ptr&& rhs) noexcept - : target_(rhs.target_) { + : target_(detail::assign_ptr_(rhs.target_)) { static_assert( std::is_convertible::value, "Type mismatch. weak_intrusive_ptr move constructor got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. weak_intrusive_ptr move constructor got pointer with differing null value."); -#endif rhs.target_ = FromNullType::singleton(); } @@ -486,17 +460,10 @@ class AT_CORE_EXPORT weak_intrusive_ptr final { template /* implicit */ weak_intrusive_ptr( const weak_intrusive_ptr& rhs) - : target_(rhs.target_) { + : target_(detail::assign_ptr_(rhs.target_)) { static_assert( std::is_convertible::value, "Type mismatch. weak_intrusive_ptr copy constructor got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. weak_intrusive_ptr copy constructor got pointer with differing null value."); -#endif retain_(); } @@ -515,13 +482,6 @@ class AT_CORE_EXPORT weak_intrusive_ptr final { static_assert( std::is_convertible::value, "Type mismatch. weak_intrusive_ptr move assignment got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. weak_intrusive_ptr move assignment got pointer with differing null value."); -#endif weak_intrusive_ptr tmp = std::move(rhs); swap(tmp); return *this; @@ -537,13 +497,6 @@ class AT_CORE_EXPORT weak_intrusive_ptr final { static_assert( std::is_convertible::value, "Type mismatch. weak_intrusive_ptr copy assignment got pointer of wrong type."); -#ifndef _WIN32 - // This static_assert triggers on MSVC - // error C2131: expression did not evaluate to a constant - static_assert( - NullType::singleton() == FromNullType::singleton(), - "NullType mismatch. 
weak_intrusive_ptr copy assignment got pointer with differing null value."); -#endif weak_intrusive_ptr tmp = rhs; swap(tmp); return *this; diff --git a/aten/src/ATen/core/intrusive_ptr_test.cpp b/aten/src/ATen/core/intrusive_ptr_test.cpp index df98563f1ff1fc..4d0701ebe164d7 100644 --- a/aten/src/ATen/core/intrusive_ptr_test.cpp +++ b/aten/src/ATen/core/intrusive_ptr_test.cpp @@ -59,6 +59,23 @@ class ChildDestructableMock final : public DestructableMock { ChildDestructableMock(bool* resourcesReleased, bool* wasDestructed) : DestructableMock(resourcesReleased, wasDestructed) {} }; +class NullType1 final { + static SomeClass singleton_; +public: + static constexpr SomeClass* singleton() { + return &singleton_; + } +}; +SomeClass NullType1::singleton_; +class NullType2 final { + static SomeClass singleton_; +public: + static constexpr SomeClass* singleton() { + return &singleton_; + } +}; +SomeClass NullType2::singleton_; +static_assert(NullType1::singleton() != NullType2::singleton(), ""); } // namespace static_assert( @@ -262,6 +279,19 @@ TEST( EXPECT_FALSE(obj2.defined()); } +TEST( + IntrusivePtrTest, + givenNullPtr_whenMoveAssigningToDifferentNullptr_thenHasNewNullptr) { + intrusive_ptr obj1; + intrusive_ptr obj2; + obj2 = std::move(obj1); + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_EQ(NullType1::singleton(), obj1.get()); + EXPECT_EQ(NullType2::singleton(), obj2.get()); + EXPECT_FALSE(obj1.defined()); + EXPECT_FALSE(obj2.defined()); +} + TEST(IntrusivePtrTest, givenValidPtr_whenCopyAssigning_thenPointsToSameObject) { intrusive_ptr obj1 = make_intrusive(); intrusive_ptr obj2 = make_intrusive(); @@ -359,6 +389,19 @@ TEST( EXPECT_FALSE(obj2.defined()); } +TEST( + IntrusivePtrTest, + givenNullPtr_whenCopyAssigningToDifferentNullptr_thenHasNewNullptr) { + intrusive_ptr obj1; + intrusive_ptr obj2; + obj2 = obj1; + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_EQ(NullType1::singleton(), obj1.get()); + EXPECT_EQ(NullType2::singleton(), obj2.get()); + EXPECT_FALSE(obj1.defined()); + EXPECT_FALSE(obj2.defined()); +} + TEST(IntrusivePtrTest, givenPtr_whenMoveConstructing_thenPointsToSameObject) { intrusive_ptr obj1 = make_intrusive(); SomeClass* obj1ptr = obj1.get(); @@ -420,6 +463,18 @@ TEST( EXPECT_FALSE(obj2.defined()); } +TEST( + IntrusivePtrTest, + givenNullPtr_whenMoveConstructingToDifferentNullptr_thenHasNewNullptr) { + intrusive_ptr obj1; + intrusive_ptr obj2 = std::move(obj1); + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_EQ(NullType1::singleton(), obj1.get()); + EXPECT_EQ(NullType2::singleton(), obj2.get()); + EXPECT_FALSE(obj1.defined()); + EXPECT_FALSE(obj2.defined()); +} + TEST(IntrusivePtrTest, givenPtr_whenCopyConstructing_thenPointsToSameObject) { intrusive_ptr obj1 = make_intrusive(); SomeClass* obj1ptr = obj1.get(); @@ -482,6 +537,18 @@ TEST( EXPECT_FALSE(obj2.defined()); } +TEST( + IntrusivePtrTest, + givenNullPtr_whenCopyConstructingToDifferentNullptr_thenHasNewNullptr) { + intrusive_ptr obj1; + intrusive_ptr obj2 = obj1; + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_EQ(NullType1::singleton(), obj1.get()); + EXPECT_EQ(NullType2::singleton(), obj2.get()); + EXPECT_FALSE(obj1.defined()); + EXPECT_FALSE(obj2.defined()); +} + TEST(IntrusivePtrTest, SwapFunction) { intrusive_ptr obj1 = make_intrusive(); intrusive_ptr obj2 = make_intrusive(); @@ -1520,9 +1587,9 @@ weak_intrusive_ptr make_weak_only(Args&&... 
args) { auto intrusive = make_intrusive(std::forward(args)...); return weak_intrusive_ptr(intrusive); } -template -weak_intrusive_ptr make_invalid_weak() { - return weak_intrusive_ptr(intrusive_ptr()); +template > +weak_intrusive_ptr make_invalid_weak() { + return weak_intrusive_ptr(intrusive_ptr()); } } // namespace @@ -1752,6 +1819,17 @@ TEST( EXPECT_TRUE(obj2.weak.expired()); } +TEST( + WeakIntrusivePtrTest, + givenNullPtr_whenMoveAssigningToDifferentNullptr_thenHasNewNullptr) { + weak_intrusive_ptr obj1 = make_invalid_weak(); + weak_intrusive_ptr obj2 = make_invalid_weak(); + obj2 = std::move(obj1); + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_TRUE(obj1.expired()); + EXPECT_TRUE(obj2.expired()); +} + TEST( WeakIntrusivePtrTest, givenValidPtr_whenCopyAssigning_thenPointsToSameObject) { @@ -1930,6 +2008,17 @@ TEST( EXPECT_TRUE(obj2.weak.expired()); } +TEST( + WeakIntrusivePtrTest, + givenNullPtr_whenCopyAssigningToDifferentNullptr_thenHasNewNullptr) { + weak_intrusive_ptr obj1 = make_invalid_weak(); + weak_intrusive_ptr obj2 = make_invalid_weak(); + obj2 = obj1; + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_TRUE(obj1.expired()); + EXPECT_TRUE(obj2.expired()); +} + TEST( WeakIntrusivePtrTest, givenPtr_whenMoveConstructing_thenPointsToSameObject) { @@ -2014,6 +2103,16 @@ TEST( EXPECT_TRUE(obj2.expired()); } +TEST( + WeakIntrusivePtrTest, + givenNullPtr_whenMoveConstructingToDifferentNullptr_thenHasNewNullptr) { + weak_intrusive_ptr obj1 = make_invalid_weak(); + weak_intrusive_ptr obj2 = std::move(obj1); + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_TRUE(obj1.expired()); + EXPECT_TRUE(obj2.expired()); +} + TEST( WeakIntrusivePtrTest, givenPtr_whenCopyConstructing_thenPointsToSameObject) { @@ -2097,6 +2196,16 @@ TEST( EXPECT_TRUE(obj2.expired()); } +TEST( + WeakIntrusivePtrTest, + givenNullPtr_whenCopyConstructingToDifferentNullptr_thenHasNewNullptr) { + weak_intrusive_ptr obj1 = make_invalid_weak(); + weak_intrusive_ptr obj2 = obj1; + EXPECT_NE(NullType1::singleton(), NullType2::singleton()); + EXPECT_TRUE(obj1.expired()); + EXPECT_TRUE(obj2.expired()); +} + TEST(WeakIntrusivePtrTest, SwapFunction) { IntrusiveAndWeak obj1 = make_weak_intrusive(); IntrusiveAndWeak obj2 = make_weak_intrusive(); diff --git a/torch/csrc/jit/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp similarity index 86% rename from torch/csrc/jit/ivalue.cpp rename to aten/src/ATen/core/ivalue.cpp index 315da36deb196f..3d2b56893e7188 100644 --- a/torch/csrc/jit/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -1,12 +1,15 @@ -#include "torch/csrc/jit/assertions.h" -#include "torch/csrc/jit/ivalue.h" -#include +#include +#include #define TORCH_FORALL_TAGS(_) \ _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) _(TensorList) namespace torch { namespace jit { +AT_API c10::intrusive_ptr ConstantString::create(std::string str_) { + return c10::make_intrusive(std::move(str_)); +} + namespace { template diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h new file mode 100644 index 00000000000000..914598f6ceb426 --- /dev/null +++ b/aten/src/ATen/core/ivalue.h @@ -0,0 +1,425 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace torch { namespace jit { + +template +using Shared = c10::intrusive_ptr; + +// string +struct AT_API ConstantString final : c10::intrusive_ptr_target { + private: + const std::string str_; + public: + ConstantString(std::string str) + : str_(std::move(str)) 
{} + static c10::intrusive_ptr create(std::string str_); + const std::string & string() const { + return str_; + } + operator const std::string & () const { + return string(); + } + AT_API friend std::ostream& operator<<( + std::ostream& out, + const ConstantString& v); +}; + +// non-mutable list +template +struct AT_CORE_EXPORT ConstantList final : c10::intrusive_ptr_target { + private: + const std::vector elements_; + public: + ConstantList(std::vector elements_) + : elements_(std::move(elements_)) {} + static c10::intrusive_ptr> create(std::vector elements_) { + return c10::make_intrusive>(std::move(elements_)); + } + const std::vector& elements() const { + return elements_; + } + operator const std::vector&() const { + return elements(); + } +}; + +struct IValue; +using Tuple = ConstantList; +using IntList = ConstantList; +using TensorList = ConstantList; +using DoubleList = ConstantList; + +// IValue is the generic tagged union used by the interpreter to hold +// all value types. +// It is a 16-byte object with an 8-byte payload and an 8-byte tag. +// The tag is currently 4 bytes to determine the type, and 1 byte +// to mark whether that type is a subtype of c10::intrusive_ptr_target and needs +// retain/release calls. + +#define TORCH_FORALL_TAGS(_) \ + _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(String) _(TensorList) + +struct AT_API IValue final { + IValue() + : payload{0} + , tag(Tag::None) + , is_intrusive_ptr(false) {} + IValue(const IValue& rhs) + : payload(rhs.payload), + tag(rhs.tag), + is_intrusive_ptr(rhs.is_intrusive_ptr) { + if (is_intrusive_ptr) { + c10::raw::intrusive_ptr::incref(payload.as_intrusive_ptr); + } + } + IValue(IValue&& rhs) noexcept : IValue() { + swap(rhs); + } + ~IValue() { + if (is_intrusive_ptr) { + c10::raw::intrusive_ptr::decref(payload.as_intrusive_ptr); + } + } + IValue & operator=(IValue && rhs) & noexcept { + IValue(std::move(rhs)).swap(*this); // this also sets rhs to None + return *this; + } + IValue & operator=(IValue const & rhs) & { + IValue(rhs).swap(*this); + return *this; + } + void swap(IValue & rhs) noexcept { + std::swap(payload, rhs.payload); + std::swap(is_intrusive_ptr, rhs.is_intrusive_ptr); + std::swap(tag, rhs.tag); + } + // Accessors for subtypes are arranged together below + // While some of these accessors could be generated through templates, + // we prefer to write them manually for clarity + + // Tensor + IValue(at::Tensor t) + : tag(Tag::Tensor), is_intrusive_ptr(t.defined()) { + // Note: the undefined tensor is not refcounted, so while it + // is tagged as a tensor, is_intrusive_ptr is set to false. + // This is not an optional optimization: our incref call + // *will not* do the right thing when called on an + // undefined tensor. 
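  // Concretely (hedged editorial sketch, not part of the patch):
  //   at::Tensor undef;        // default-constructed, defined() == false
  //   IValue v(undef);         // tag == Tag::Tensor, but is_intrusive_ptr == false
  //   IValue w = v;            // copy ctor skips incref because is_intrusive_ptr is false
  //   // destroying v and w likewise skips decref, so the non-refcounted
  //   // singleton TensorImpl behind `undef` is never touched.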
+ payload.as_intrusive_ptr = t.unsafeReleaseTensorImpl(); + } + bool isTensor() const { return Tag::Tensor == tag; } + at::Tensor toTensor() && { + AT_ASSERT(isTensor()); + return at::Tensor(moveToIntrusivePtr()); + } + at::Tensor toTensor() const & { + AT_ASSERT(isTensor()); + return at::Tensor(toIntrusivePtr()); + } + + // Tuple + IValue(c10::intrusive_ptr v); + bool isTuple() const { return Tag::Tuple == tag; } + c10::intrusive_ptr toTuple() && { + AT_ASSERT(isTuple()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toTuple() const & { + AT_ASSERT(isTuple()); + return toIntrusivePtr(); + } + + // Double + IValue(double d) + : tag(Tag::Double), is_intrusive_ptr(false) { + payload.as_double = d; + } + bool isDouble() const { return Tag::Double == tag; } + double toDouble() const { + AT_ASSERT(isDouble()); + return payload.as_double; + } + + // Int + IValue(int64_t i) + : tag(Tag::Int), is_intrusive_ptr(false) { + payload.as_int = i; + } + + // allow you to pass literals (3, 4) without ambiguity + IValue(int32_t i) + : IValue(static_cast(i)) {} + IValue(bool b) + : IValue(static_cast(b)) {} + + bool isInt() const { return Tag::Int == tag; } + + int64_t toInt() const { + AT_ASSERT(isInt()); + return payload.as_int; + } + + // IntList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + IValue(at::ArrayRef v) + : IValue(v.vec()) {} + bool isIntList() const { return Tag::IntList == tag; } + c10::intrusive_ptr toIntList() && { + AT_ASSERT(isIntList()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toIntList() const & { + AT_ASSERT(isIntList()); + return toIntrusivePtr(); + } + + const std::vector& toIntListRef() const; + const std::vector& toDoubleListRef() const; + const std::vector& toTensorListRef() const; + + // ConstantString + IValue(c10::intrusive_ptr v); + IValue(std::string v); + bool isString() const { return Tag::String == tag; } + c10::intrusive_ptr toString() && { + AT_ASSERT(isString()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toString() const & { + AT_ASSERT(isString()); + return toIntrusivePtr(); + } + + // DoubleList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + bool isDoubleList() const { return Tag::DoubleList == tag; } + c10::intrusive_ptr toDoubleList() && { + AT_ASSERT(isDoubleList()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toDoubleList() const & { + AT_ASSERT(isDoubleList()); + return toIntrusivePtr(); + } + + //TensorList + IValue(c10::intrusive_ptr v); + IValue(std::vector v); + bool isTensorList() const { return Tag::TensorList == tag; } + c10::intrusive_ptr toTensorList() && { + AT_ASSERT(isTensorList()); + return moveToIntrusivePtr(); + } + c10::intrusive_ptr toTensorList() const & { + AT_ASSERT(isTensorList()); + return toIntrusivePtr(); + } + + // None + bool isNone() { + return Tag::None == tag; + } + std::string toNone() const { + return "None"; + } + // Scalar, which gets encoded as either an Int or a Double + IValue(at::Scalar s) + : IValue() { + if(s.isFloatingPoint()) { + *this = s.toDouble(); + } else { + *this = s.toLong(); + } + } + bool isScalar() { + return isDouble() || isInt(); + } + at::Scalar toScalar() const { + if(isDouble()) + return toDouble(); + else if(isInt()) + return toInt(); + else + throw std::runtime_error("IValue is not a Scalar"); + } + + // for debugging + std::string tagKind() const { + switch(tag) { + #define DEFINE_CASE(x) case Tag::x: return #x; + TORCH_FORALL_TAGS(DEFINE_CASE) + #undef DEFINE_CASE + } + return "Invalid Tag"; + } + + // generic v.to() 
implementations + // that can be used in special functions like pop/push + // that use template meta-programming. + // prefer the directly named methods when you can, + // since they are simpler to understand + + // Note: if you get linker errors saying one of these is missing, + // change it to ... && = delete; and you will see better error messages for why + // However, we cannot commit this because some compiler versions barf on it. + template + T to() &&; + template + T to() const &; + + AT_API friend std::ostream& operator<<(std::ostream& out, const IValue& v); + + private: + // NOTE: IValue tags are intentionally private. In the future we may encode + // this value different (e.g. using NaN boxing), and this would make it more + // costly to determine the tag for all types vs just determining if something + // is a particular type. Instead we want clients to use the `isX` methods when + // possible. If for perf. reasons you really, absolutely, must have a jump + // table, then we can revisit this. + enum class Tag : uint32_t { +#define DEFINE_TAG(x) x, + TORCH_FORALL_TAGS(DEFINE_TAG) +#undef DEFINE_TAG + }; + + template> + c10::intrusive_ptr moveToIntrusivePtr() { + auto t = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); + clearToNone(); + return t; + } + template> + c10::intrusive_ptr toIntrusivePtr() const { + auto r = c10::intrusive_ptr::reclaim(static_cast(payload.as_intrusive_ptr)); + auto p = r; + r.release(); + return p; + } + void clearToNone() { + payload.as_int = 0; + tag = Tag::None; + is_intrusive_ptr = false; + } + union { + int64_t as_int; + double as_double; + c10::intrusive_ptr_target* as_intrusive_ptr; + } payload; + Tag tag; + bool is_intrusive_ptr; +}; + +#undef TORCH_FORALL_TAGS + + +#define DEFINE_TO(type, method_name) \ +template<> \ +inline type IValue::to() && { \ + return std::move(*this).method_name(); \ +} \ +template<> \ +inline type IValue::to() const & { \ + return this->method_name(); \ +} +DEFINE_TO(at::Tensor, toTensor) +DEFINE_TO(c10::intrusive_ptr, toTuple) +DEFINE_TO(double, toDouble) +DEFINE_TO(int64_t, toInt) +DEFINE_TO(c10::intrusive_ptr, toDoubleList) +DEFINE_TO(c10::intrusive_ptr, toIntList) +DEFINE_TO(c10::intrusive_ptr, toTensorList) +DEFINE_TO(c10::intrusive_ptr, toString) +DEFINE_TO(at::Scalar, toScalar) +DEFINE_TO(bool, toInt) +DEFINE_TO(std::vector, toIntListRef) +DEFINE_TO(std::vector, toDoubleListRef) +DEFINE_TO(std::vector, toTensorListRef) + +#undef DEFINE_TO + +#define DEFINE_TO_WITH_BODY(type, body) \ +template<> \ +inline type IValue::to() && { \ + body(std::move(*this)); \ +} \ +template<> \ +inline type IValue::to() const & { \ + body((*this)); \ +} + +#define SCALAR_TYPE_BODY(this) return static_cast(this.toInt()); +#define LAYOUT_BODY(this) return static_cast(this.toInt()); +#define DEVICE_BODY(this) \ + /* NB: const_list might be a move of the vector, so we need to */ \ + /* assign it to prevent its deallocation. 
*/ \ + auto&& const_list = this.toIntList(); \ + const auto& elems = const_list->elements(); \ + AT_ASSERT(elems.size() == 2); \ + return at::Device(static_cast(elems[0]), elems[1]); + +DEFINE_TO_WITH_BODY(at::ScalarType, SCALAR_TYPE_BODY) +DEFINE_TO_WITH_BODY(at::Layout, LAYOUT_BODY) +DEFINE_TO_WITH_BODY(at::Device, DEVICE_BODY) + +#undef DEFINE_TO_WITH_BODY +#undef SCALAR_TYPE_BODY +#undef LAYOUT_BODY +#undef DEVICE_BODY + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::Tuple), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::IntList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(IntList::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::String), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::string v) +: IValue(ConstantString::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::DoubleList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(DoubleList::create(std::move(v))) {} + +inline IValue::IValue(c10::intrusive_ptr v) +: tag(Tag::TensorList), is_intrusive_ptr(true) { + payload.as_intrusive_ptr = v.release(); +} +inline IValue::IValue(std::vector v) +: IValue(TensorList::create(std::move(v))) {} + +inline const std::vector& IValue::toIntListRef() const { + return toIntList()->elements(); +} + +inline const std::vector& IValue::toDoubleListRef() const { + return toDoubleList()->elements(); +} + +inline const std::vector& IValue::toTensorListRef() const { + return toTensorList()->elements(); +} + + +}} diff --git a/aten/src/ATen/core/typeid.h b/aten/src/ATen/core/typeid.h index e258d3be6a7fa3..a19035de1ba034 100644 --- a/aten/src/ATen/core/typeid.h +++ b/aten/src/ATen/core/typeid.h @@ -49,9 +49,9 @@ class AT_CORE_API TypeIdentifier final : public at::IdWrapper - bool Match() const { + bool Match() const noexcept { return (id_ == Id()); } @@ -452,10 +452,8 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept { class Tensor; -// Note: we have preallocated the numbers 0-8 so they line up exactly +// Note: we have preallocated the numbers so they line up exactly // with at::ScalarType's numbering. All other numbers do not matter. -// -// Notably, the "uninitialized" type id is 8, not 0, for hysterical raisins. 
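(Aside, not part of the diff: a minimal usage sketch of the IValue tagged union added in ivalue.h above. The types and accessors come from that header; the concrete values, and the omitted includes/namespace qualification, are illustrative assumptions.)

  // Hypothetical client code for the IValue API introduced above.
  IValue iv_int(int64_t(42));                        // tagged Int, stored inline, no refcount
  IValue iv_str(std::string("hello"));               // wraps a refcounted ConstantString
  IValue iv_list(std::vector<int64_t>{1, 2, 3});     // wraps a refcounted IntList

  int64_t n = iv_int.toInt();                        // checked accessor (asserts the tag)
  const std::string& s = iv_str.toString()->string();
  const std::vector<int64_t>& xs = iv_list.toIntListRef();
  int64_t m = iv_int.to<int64_t>();                  // generic accessor used by push/pop-style helpers
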
struct _CaffeHighestPreallocatedTypeId final {}; @@ -470,7 +468,7 @@ CAFFE_DECLARE_KNOWN_TYPE(7, double) CAFFE_DECLARE_KNOWN_TYPE(8, at::ComplexHalf) CAFFE_DECLARE_KNOWN_TYPE(9, std::complex) CAFFE_DECLARE_KNOWN_TYPE(10, std::complex) -// 10 = undefined type id +// 11 = undefined type id CAFFE_DECLARE_KNOWN_TYPE(12, Tensor) CAFFE_DECLARE_KNOWN_TYPE(13, std::string) diff --git a/aten/src/ATen/cpu/vec256/intrinsics.h b/aten/src/ATen/cpu/vec256/intrinsics.h index 442e8fd0511fc7..76779aada7a0b3 100644 --- a/aten/src/ATen/cpu/vec256/intrinsics.h +++ b/aten/src/ATen/cpu/vec256/intrinsics.h @@ -19,7 +19,7 @@ /* GCC-compatible compiler, targeting ARM with WMMX */ #include #elif (defined(__GNUC__) || defined(__xlC__)) && \ - (defined(__VEC__) || defined(__ALTIVEC__)) + (defined(__VEC__) || defined(__ALTIVEC__)) /* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ #include #elif defined(__GNUC__) && defined(__SPE__) diff --git a/aten/src/ATen/cpu/vec256/vec256.h b/aten/src/ATen/cpu/vec256/vec256.h index 98f1158465f2f7..71688bd48e5090 100644 --- a/aten/src/ATen/cpu/vec256/vec256.h +++ b/aten/src/ATen/cpu/vec256/vec256.h @@ -32,4 +32,194 @@ std::ostream& operator<<(std::ostream& stream, const Vec256& vec) { return stream; } + +#if defined(__AVX__) && !defined(_MSC_VER) + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template<> +Vec256 cast(const Vec256& src) { + return _mm256_castpd_ps(src); +} + +template<> +Vec256 cast(const Vec256& src) { + return _mm256_castps_pd(src); +} + +#if defined(__AVX2__) + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +#define DEFINE_FLOAT_INT_CAST(int_t, float_t, float_ch) \ +template<> \ +Vec256 cast(const Vec256& src) { \ + return _mm256_castp ## float_ch ## _si256(src); \ +} \ +template<> \ +Vec256 cast(const Vec256& src) { \ + return _mm256_castsi256_p ## float_ch (src); \ +} + +DEFINE_FLOAT_INT_CAST(int64_t, double, d) +DEFINE_FLOAT_INT_CAST(int32_t, double, d) +DEFINE_FLOAT_INT_CAST(int16_t, double, d) +DEFINE_FLOAT_INT_CAST(int64_t, float, s) +DEFINE_FLOAT_INT_CAST(int32_t, float, s) +DEFINE_FLOAT_INT_CAST(int16_t, float, s) + +#undef DEFINE_FLOAT_INT_CAST + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template +c10::guts::enable_if_t> +inline gather(const double* base_addr, const Vec256& vindex) { + return _mm256_i64gather_pd(base_addr, vindex, scale); +} + +template +c10::guts::enable_if_t> +inline gather(const float* base_addr, const Vec256& vindex) { + return _mm256_i32gather_ps(base_addr, vindex, scale); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template +c10::guts::enable_if_t> +inline mask_gather(const Vec256& src, const double* base_addr, + const Vec256& vindex, const Vec256& mask) { + return _mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale); +} + +template +c10::guts::enable_if_t> +inline mask_gather(const Vec256& src, const float* base_addr, + const Vec256& vindex, const Vec256& mask) { + return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +// Only works for inputs in the range: [-2^51, 2^51] +// From: https://stackoverflow.com/a/41148578 +template<> +Vec256 +inline convert_to_int_of_same_size(const Vec256 &src) { + auto x = _mm256_add_pd(src, _mm256_set1_pd(0x0018000000000000)); + return _mm256_sub_epi64( + _mm256_castpd_si256(x), + 
_mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)) + ); +} + +template<> +Vec256 +inline convert_to_int_of_same_size(const Vec256 &src) { + return _mm256_cvttps_epi32(src); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +std::pair, Vec256> +inline interleave2(const Vec256& a, const Vec256& b) { + // inputs: + // a = {a0, a1, a3, a3} + // b = {b0, b1, b2, b3} + + // swap lanes: + // a_swapped = {a0, a1, b0, b1} + // b_swapped = {a2, a3, b2, b3} + static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart + static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 4 bits apart + auto a_swapped = _mm256_permute2f128_pd(a, b, swap_ctrl_a); + auto b_swapped = _mm256_permute2f128_pd(a, b, swap_ctrl_b); + + // group cols crossing lanes: + // return {a0, b0, a1, b1} + // {a2, b2, a3, b3} + static constexpr int group_ctrl = 0 | (2 << 2) | (1 << 4) | (3 << 6); // 0, 2, 1, 3 + return std::make_pair(_mm256_permute4x64_pd(a_swapped, group_ctrl), + _mm256_permute4x64_pd(b_swapped, group_ctrl)); +} + +template <> +std::pair, Vec256> +inline interleave2(const Vec256& a, const Vec256& b) { + // inputs: + // a = {a0, a1, a2, a3, a4, a5, a6, a7} + // b = {b0, b1, b2, b3, b4, b5, b6, b7} + + // swap lanes: + // a_swapped = {a0, a1, a2, a3, b0, b1, b2, b3} + // b_swapped = {a4, a5, a6, a7, b4, b5, b6, b7} + // TODO: can we support caching this? + static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart + static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 4 bits apart + auto a_swapped = _mm256_permute2f128_ps(a, b, swap_ctrl_a); + auto b_swapped = _mm256_permute2f128_ps(a, b, swap_ctrl_b); + + // group cols crossing lanes: + // return {a0, b0, a1, b1, a2, b2, a3, b3} + // {a4, b4, a5, b5, a6, b6, a7, b7} + const __m256i group_ctrl = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); + return std::make_pair(_mm256_permutevar8x32_ps(a_swapped, group_ctrl), + _mm256_permutevar8x32_ps(b_swapped, group_ctrl)); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template <> +std::pair, Vec256> +inline deinterleave2(const Vec256& a, const Vec256& b) { + // inputs: + // a = {a0, b0, a1, b1} + // b = {a2, b2, a3, b3} + + // group cols crossing lanes: + // a_grouped = {a0, a1, b0, b1} + // b_grouped = {a2, a3, b2, b3} + static constexpr int group_ctrl = 0 | (2 << 2) | (1 << 4) | (3 << 6); // 0, 2, 1, 3 + auto a_grouped = _mm256_permute4x64_pd(a, group_ctrl); + auto b_grouped = _mm256_permute4x64_pd(b, group_ctrl); + + // swap lanes: + // return {a0, a1, a2, a3} + // {b0, b1, b2, b3} + static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart + static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 4 bits apart + return std::make_pair(_mm256_permute2f128_pd(a_grouped, b_grouped, swap_ctrl_a), + _mm256_permute2f128_pd(a_grouped, b_grouped, swap_ctrl_b)); +} + +template <> +std::pair, Vec256> +inline deinterleave2(const Vec256& a, const Vec256& b) { + // inputs: + // a = {a0, b0, a1, b1, a2, b2, a3, b3} + // b = {a4, b4, a5, b5, a6, b6, a7, b7} + + // group cols crossing lanes: + // a_grouped = {a0, a1, a2, a3, b0, b1, b2, b3} + // b_grouped = {a4, a5, a6, a7, b4, b5, b6, b7} + // TODO: can we support caching this? 
+ const __m256i group_ctrl = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); + auto a_grouped = _mm256_permutevar8x32_ps(a, group_ctrl); + auto b_grouped = _mm256_permutevar8x32_ps(b, group_ctrl); + + // swap lanes: + // return {a0, a1, a2, a3, a4, a5, a6, a7} + // {b0, b1, b2, b3, b4, b5, b6, b7} + static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart + static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 4 bits apart + return std::make_pair(_mm256_permute2f128_ps(a_grouped, b_grouped, swap_ctrl_a), + _mm256_permute2f128_ps(a_grouped, b_grouped, swap_ctrl_b)); +} + +#endif // defined(__AVX2__) + +#endif // defined(__AVX__) && !defined(_MSC_VER) + }}} diff --git a/aten/src/ATen/cpu/vec256/vec256_base.h b/aten/src/ATen/cpu/vec256/vec256_base.h index fa3c4e550d6a32..f1eba7e2d3c428 100644 --- a/aten/src/ATen/cpu/vec256/vec256_base.h +++ b/aten/src/ATen/cpu/vec256/vec256_base.h @@ -3,8 +3,11 @@ #include #include #include +#include +#include #include "ATen/Utils.h" +#include "ATen/core/C++17.h" #if defined(__GNUC__) #define __at_align32__ __attribute__((aligned(32))) @@ -18,6 +21,21 @@ namespace at { namespace vec256 { namespace { +template struct int_of_size; + +#define DEFINE_INT_OF_SIZE(int_t) \ +template<> struct int_of_size { using type = int_t; } + +DEFINE_INT_OF_SIZE(int64_t); +DEFINE_INT_OF_SIZE(int32_t); +DEFINE_INT_OF_SIZE(int16_t); +DEFINE_INT_OF_SIZE(int8_t); + +#undef DEFINE_INT_OF_SIZE + +template +using int_same_size_t = typename int_of_size::type; + // NOTE: If you specialize on a type, you must define all operations! // emulates vectorized types @@ -33,8 +51,13 @@ struct Vec256 { values[i] = val; } } + template> + Vec256(Args... vals) { + values = { vals... }; + } template - static Vec256 blend(Vec256 a, Vec256 b) { + static Vec256 blend(const Vec256& a, const Vec256& b) { int64_t mask = mask_; Vec256 vec; for (int64_t i = 0; i < size; i++) { @@ -47,7 +70,29 @@ struct Vec256 { } return vec; } - static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + Vec256 vec; + int_same_size_t buffer[size]; + mask.store(buffer); + for (int64_t i = 0; i < size; i++) { + if (buffer[i] & 0x01) + { + vec[i] = b[i]; + } else { + vec[i] = a[i]; + } + } + return vec; + } + static Vec256 arange(T base = static_cast(0), T step = static_cast(1)) { + Vec256 vec; + for (int64_t i = 0; i < size; i++) { + vec.values[i] = base + i * step; + } + return vec; + } + static Vec256 set(const Vec256& a, const Vec256& b, int64_t count = size) { Vec256 vec; for (int64_t i = 0; i < size; i++) { if (i < count) { @@ -173,9 +218,28 @@ struct Vec256 { } return ret; } +#define DEFINE_COMP(binary_pred) \ + Vec256 operator binary_pred(const Vec256 &other) const { \ + Vec256 vec; \ + for (int64_t i = 0; i != size; i++) { \ + if (values[i] binary_pred other.values[i]) { \ + std::memset(static_cast(vec.values + i), 0xFF, sizeof(T)); \ + } else { \ + std::memset(static_cast(vec.values + i), 0, sizeof(T)); \ + } \ + } \ + return vec; \ + } + DEFINE_COMP(==) + DEFINE_COMP(!=) + DEFINE_COMP(>=) + DEFINE_COMP(<=) + DEFINE_COMP(>) + DEFINE_COMP(<) +#undef DEFINE_COMP }; -template Vec256 operator+(const Vec256 &a, const Vec256 &b) { +template Vec256 inline operator+(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = a[i] + b[i]; @@ -183,7 +247,7 @@ template Vec256 operator+(const Vec256 &a, const Vec256 &b) { return c; } -template Vec256 operator-(const Vec256 &a, const 
Vec256 &b) { +template Vec256 inline operator-(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = a[i] - b[i]; @@ -191,7 +255,7 @@ template Vec256 operator-(const Vec256 &a, const Vec256 &b) { return c; } -template Vec256 operator*(const Vec256 &a, const Vec256 &b) { +template Vec256 inline operator*(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = a[i] * b[i]; @@ -199,7 +263,7 @@ template Vec256 operator*(const Vec256 &a, const Vec256 &b) { return c; } -template Vec256 operator/(const Vec256 &a, const Vec256 &b) __ubsan_ignore_float_divide_by_zero__ { +template Vec256 inline operator/(const Vec256 &a, const Vec256 &b) __ubsan_ignore_float_divide_by_zero__ { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = a[i] / b[i]; @@ -207,7 +271,8 @@ template Vec256 operator/(const Vec256 &a, const Vec256 &b) _ return c; } -template Vec256 max(const Vec256 &a, const Vec256 &b) { + +template Vec256 inline max(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = std::max(a[i], b[i]); @@ -215,7 +280,7 @@ template Vec256 max(const Vec256 &a, const Vec256 &b) { return c; } -template Vec256 min(const Vec256 &a, const Vec256 &b) { +template Vec256 inline min(const Vec256 &a, const Vec256 &b) { Vec256 c = Vec256(); for (int i = 0; i != Vec256::size; i++) { c[i] = std::min(a[i], b[i]); @@ -223,9 +288,154 @@ template Vec256 min(const Vec256 &a, const Vec256 &b) { return c; } +#define DEFINE_BITWISE_OP(op) \ +template \ +Vec256 inline operator op(const Vec256 &a, const Vec256 &b) { \ + using iT = int_same_size_t; \ + iT buffer[Vec256::size]; \ + for (int64_t i = 0; i != Vec256::size; i++) { \ + auto a_val = a[i]; \ + auto b_val = b[i]; \ + iT *i_a_ptr = reinterpret_cast(&a_val); \ + iT *i_b_ptr = reinterpret_cast(&b_val); \ + buffer[i] = *i_a_ptr op *i_b_ptr; \ + } \ + return Vec256::loadu(buffer); \ +} +DEFINE_BITWISE_OP(&) +DEFINE_BITWISE_OP(|) +DEFINE_BITWISE_OP(^) +#undef DEFINE_BITWISE_OP + template -T fmadd(const T& a, const T& b, const T& c) { +inline T fmadd(const T& a, const T& b, const T& c) { return a * b + c; } +template +c10::guts::enable_if_t> +inline gather(T const* base_addr, const Vec256>& vindex) { + static constexpr int size = Vec256::size; + int_same_size_t index_arr[size]; + vindex.store(static_cast(index_arr)); + T buffer[size]; + for (int64_t i = 0; i < size; i++) { + buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)]; + } + return Vec256::loadu(static_cast(buffer)); +} + +template +c10::guts::enable_if_t> +inline mask_gather(const Vec256& src, T const* base_addr, + const Vec256>& vindex, Vec256& mask) { + static constexpr int size = Vec256::size; + T src_arr[size]; + int_same_size_t mask_arr[size]; // use int type so we can logical and + int_same_size_t index_arr[size]; + src.store(static_cast(src_arr)); + mask.store(static_cast(mask_arr)); + vindex.store(static_cast(index_arr)); + T buffer[size]; + for (int64_t i = 0; i < size; i++) { + if (mask_arr[i] & 0x01) { // check highest bit + buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)]; + } else { + buffer[i] = src_arr[i]; + } + } + mask = Vec256(); // "zero out" mask + return Vec256::loadu(static_cast(buffer)); +} + +// Cast a given vector to another type without changing the bits representation. +// So a Vec of 256 bits containing all ones can be cast to a +// Vec of 256 bits containing all ones (i.e., four negative 1s). 
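(Aside, not part of the diff: to make the comment above concrete, a small hypothetical example contrasting the bit-preserving cast defined just below with the value-converting convert_to_int_of_same_size; the lane values are made up.)

  // cast<> reinterprets the 256-bit payload; no values are converted.
  Vec256<int64_t> all_ones(int64_t(-1));                  // every lane is 0xFFFF...FF
  Vec256<double> reinterpreted = cast<double>(all_ones);  // same bits, now read as four NaNs
  // convert_to_int_of_same_size() converts lane values instead:
  Vec256<double> v(3.0);
  Vec256<int64_t> as_int = convert_to_int_of_same_size(v);  // {3, 3, 3, 3}
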
+namespace { + // There is a struct here because we don't have static_if and I can't + // partially specialize a templated function. + template + struct CastImpl { + static inline Vec256 apply(const Vec256& src) { + src_t src_arr[Vec256::size]; + src.store(static_cast(src_arr)); + return Vec256::loadu(static_cast(src_arr)); + } + }; + + template + struct CastImpl { + static inline Vec256 apply(const Vec256& src) { + return src; + } + }; +} +template +Vec256 cast(const Vec256& src) { + return CastImpl::apply(src); +} + +template +inline Vec256> convert_to_int_of_same_size(const Vec256& src) { + static constexpr int size = Vec256::size; + T src_arr[size]; + src.store(static_cast(src_arr)); + int_same_size_t buffer[size]; + for (int64_t i = 0; i < size; i++) { + buffer[i] = static_cast>(src_arr[i]); + } + return Vec256>::loadu(static_cast(buffer)); +} + +// E.g., inputs: a Vec256 = {a0, b0, a1, b1, a2, b2, a3, b3} +// b Vec256 = {a4, b4, a5, b5, a6, b6, a7, b7} +// returns: Vec256 = {a0, a1, a2, a3, a4, a5, a6, a7} +// Vec256 = {b0, b1, b2, b3, b4, b5, b6, b7} +template +inline c10::guts::enable_if_t::size % 2 == 0, std::pair, Vec256>> +deinterleave2(const Vec256& a, const Vec256& b) { + static constexpr int size = Vec256::size; + static constexpr int half_size = size / 2; + T a_arr[size]; + T b_arr[size]; + T buffer1[size]; + T buffer2[size]; + a.store(static_cast(a_arr)); + b.store(static_cast(b_arr)); + for (int64_t i = 0; i < half_size; i++) { + buffer1[i] = a_arr[i * 2]; + buffer1[half_size + i] = b_arr[i * 2]; + buffer2[i] = a_arr[i * 2 + 1]; + buffer2[half_size + i] = b_arr[i * 2 + 1]; + } + return std::make_pair(Vec256::loadu(static_cast(buffer1)), + Vec256::loadu(static_cast(buffer2))); +} + +// inverse operation of deinterleave2 +// E.g., inputs: a Vec256 = {a0, a1, a2, a3, a4, a5, a6, a7} +// b Vec256 = {b0, b1, b2, b3, b4, b5, b6, b7} +// returns: Vec256 = {a0, b0, a1, b1, a2, b2, a3, b3} +// Vec256 = {a4, b4, a5, b5, a6, b6, a7, b7} +template +inline c10::guts::enable_if_t::size % 2 == 0, std::pair, Vec256>> +interleave2(const Vec256& a, const Vec256& b) { + static constexpr int size = Vec256::size; + static constexpr int half_size = size / 2; + T a_arr[size]; + T b_arr[size]; + T buffer1[size]; + T buffer2[size]; + a.store(static_cast(a_arr)); + b.store(static_cast(b_arr)); + for (int64_t i = 0; i < half_size; i++) { + buffer1[i * 2] = a_arr[i]; + buffer1[i * 2 + 1] = b_arr[i]; + buffer2[i * 2] = a_arr[half_size + i]; + buffer2[i * 2 + 1] = b_arr[half_size + i]; + } + return std::make_pair(Vec256::loadu(static_cast(buffer1)), + Vec256::loadu(static_cast(buffer2))); +} + }}} diff --git a/aten/src/ATen/cpu/vec256/vec256_double.h b/aten/src/ATen/cpu/vec256/vec256_double.h index 05c21634659754..ced6fa6a37b6aa 100644 --- a/aten/src/ATen/cpu/vec256/vec256_double.h +++ b/aten/src/ATen/cpu/vec256/vec256_double.h @@ -22,14 +22,25 @@ template <> class Vec256 { Vec256(double val) { values = _mm256_set1_pd(val); } + Vec256(double val1, double val2, double val3, double val4) { + values = _mm256_setr_pd(val1, val2, val3, val4); + } operator __m256d() const { return values; } template - static Vec256 blend(Vec256 a, Vec256 b) { + static Vec256 blend(const Vec256& a, const Vec256& b) { return _mm256_blend_pd(a.values, b.values, mask); } - static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_pd(a.values, b.values, mask.values); + } + static Vec256 arange(double base = 0., double step 
= 1.) { + return Vec256(base, base + step, base + 2 * step, base + 3 * step); + } + static Vec256 set(const Vec256& a, const Vec256& b, + int64_t count = size) { switch (count) { case 0: return a; @@ -56,7 +67,7 @@ template <> class Vec256 { void store(void* ptr, int count = size) const { if (count == size) { _mm256_storeu_pd(reinterpret_cast(ptr), values); - } else { + } else if (count > 0) { double tmp_values[size]; _mm256_storeu_pd(reinterpret_cast(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(double)); @@ -154,6 +165,32 @@ template <> class Vec256 { Vec256 pow(const Vec256 &b) const { return Vec256(Sleef_powd4_u10(values, b)); } + // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vec256 operator==(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_EQ_OQ); + } + + Vec256 operator!=(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_NEQ_OQ); + } + + Vec256 operator<(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_LT_OQ); + } + + Vec256 operator<=(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_LE_OQ); + } + + Vec256 operator>(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_GT_OQ); + } + + Vec256 operator>=(const Vec256& other) const { + return _mm256_cmp_pd(values, other.values, _CMP_GE_OQ); + } }; template <> @@ -186,9 +223,24 @@ Vec256 inline min(const Vec256& a, const Vec256& b) { return _mm256_min_pd(a, b); } +template <> +Vec256 inline operator&(const Vec256& a, const Vec256& b) { + return _mm256_and_pd(a, b); +} + +template <> +Vec256 inline operator|(const Vec256& a, const Vec256& b) { + return _mm256_or_pd(a, b); +} + +template <> +Vec256 inline operator^(const Vec256& a, const Vec256& b) { + return _mm256_xor_pd(a, b); +} + #ifdef __AVX2__ template <> -Vec256 fmadd(const Vec256& a, const Vec256& b, const Vec256& c) { +Vec256 inline fmadd(const Vec256& a, const Vec256& b, const Vec256& c) { return _mm256_fmadd_pd(a, b, c); } #endif diff --git a/aten/src/ATen/cpu/vec256/vec256_float.h b/aten/src/ATen/cpu/vec256/vec256_float.h index c38fae11c24863..ebd0c10d2b62d1 100644 --- a/aten/src/ATen/cpu/vec256/vec256_float.h +++ b/aten/src/ATen/cpu/vec256/vec256_float.h @@ -16,20 +16,34 @@ template <> class Vec256 { private: __m256 values; public: - static constexpr int64_t size = 8; + static constexpr int size = 8; Vec256() {} Vec256(__m256 v) : values(v) {} Vec256(float val) { values = _mm256_set1_ps(val); } + Vec256(float val1, float val2, float val3, float val4, + float val5, float val6, float val7, float val8) { + values = _mm256_setr_ps(val1, val2, val3, val4, val5, val6, val7, val8); + } operator __m256() const { return values; } template - static Vec256 blend(Vec256 a, Vec256 b) { + static Vec256 blend(const Vec256& a, const Vec256& b) { return _mm256_blend_ps(a.values, b.values, mask); } - static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_ps(a.values, b.values, mask.values); + } + static Vec256 arange(float base = 0.f, float step = 1.f) { + return Vec256( + base, base + step, base + 2 * step, base + 3 * step, + base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step); + } + static Vec256 set(const Vec256& a, const Vec256& b, + int64_t count = size) { switch (count) { case 0: return a; @@ -61,7 +75,7 @@ template 
<> class Vec256 { void store(void* ptr, int64_t count = size) const { if (count == size) { _mm256_storeu_ps(reinterpret_cast(ptr), values); - } else { + } else if (count > 0) { float tmp_values[size]; _mm256_storeu_ps(reinterpret_cast(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(float)); @@ -159,6 +173,32 @@ template <> class Vec256 { Vec256 pow(const Vec256 &b) const { return Vec256(Sleef_powf8_u10(values, b)); } + // Comparison using the _CMP_**_OQ predicate. + // `O`: get false if an operand is NaN + // `Q`: do not raise if an operand is NaN + Vec256 operator==(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_EQ_OQ); + } + + Vec256 operator!=(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_NEQ_OQ); + } + + Vec256 operator<(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_LT_OQ); + } + + Vec256 operator<=(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_LE_OQ); + } + + Vec256 operator>(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_GT_OQ); + } + + Vec256 operator>=(const Vec256& other) const { + return _mm256_cmp_ps(values, other.values, _CMP_GE_OQ); + } }; template <> @@ -191,9 +231,24 @@ Vec256 inline min(const Vec256& a, const Vec256& b) { return _mm256_min_ps(a, b); } +template <> +Vec256 inline operator&(const Vec256& a, const Vec256& b) { + return _mm256_and_ps(a, b); +} + +template <> +Vec256 inline operator|(const Vec256& a, const Vec256& b) { + return _mm256_or_ps(a, b); +} + +template <> +Vec256 inline operator^(const Vec256& a, const Vec256& b) { + return _mm256_xor_ps(a, b); +} + #ifdef __AVX2__ template <> -Vec256 fmadd(const Vec256& a, const Vec256& b, const Vec256& c) { +Vec256 inline fmadd(const Vec256& a, const Vec256& b, const Vec256& c) { return _mm256_fmadd_ps(a, b, c); } #endif diff --git a/aten/src/ATen/cpu/vec256/vec256_int.h b/aten/src/ATen/cpu/vec256/vec256_int.h index c9b643e7d4bb09..2ca4d614c21e7b 100644 --- a/aten/src/ATen/cpu/vec256/vec256_int.h +++ b/aten/src/ATen/cpu/vec256/vec256_int.h @@ -26,6 +26,9 @@ struct Vec256 : public Vec256i { using Vec256i::Vec256i; Vec256() {} Vec256(int64_t v) { values = _mm256_set1_epi64x(v); } + Vec256(int64_t val1, int64_t val2, int64_t val3, int64_t val4) { + values = _mm256_setr_epi64x(val1, val2, val3, val4); + } template static Vec256 blend(Vec256 a, Vec256 b) { __at_align32__ int64_t tmp_values[size]; @@ -40,6 +43,13 @@ struct Vec256 : public Vec256i { tmp_values[3] = _mm256_extract_epi64(b.values, 3); return loadu(tmp_values); } + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + static Vec256 arange(int64_t base = 0, int64_t step = 1) { + return Vec256(base, base + step, base + 2 * step, base + 3 * step); + } static Vec256 set(Vec256 a, Vec256 b, int64_t count = size) { switch (count) { @@ -65,7 +75,7 @@ struct Vec256 : public Vec256i { void store(void* ptr, int count = size) const { if (count == size) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); - } else { + } else if (count > 0) { __at_align32__ int64_t tmp_values[size]; _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(int64_t)); @@ -79,6 +89,30 @@ struct Vec256 : public Vec256i { auto inverse = _mm256_xor_si256(values, is_larger); return _mm256_sub_epi64(inverse, is_larger); } + Vec256 operator==(const Vec256& other) 
const { + return _mm256_cmpeq_epi64(values, other.values); + } + Vec256 operator!=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto eq = _mm256_cmpeq_epi64(values, other.values); + return _mm256_xor_si256(zero, eq); // invert + } + Vec256 operator<(const Vec256& other) const { + return _mm256_cmpgt_epi64(other.values, values); + } + Vec256 operator<=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto gt = _mm256_cmpgt_epi64(values, other.values); + return _mm256_xor_si256(zero, gt); // invert + } + Vec256 operator>(const Vec256& other) const { + return _mm256_cmpgt_epi64(values, other.values); + } + Vec256 operator>=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto lt = _mm256_cmpgt_epi64(other.values, values); + return _mm256_xor_si256(zero, lt); // invert + } }; template <> @@ -87,10 +121,23 @@ struct Vec256 : public Vec256i { using Vec256i::Vec256i; Vec256() {} Vec256(int32_t v) { values = _mm256_set1_epi32(v); } + Vec256(int32_t val1, int32_t val2, int32_t val3, int32_t val4, + int32_t val5, int32_t val6, int32_t val7, int32_t val8) { + values = _mm256_setr_epi32(val1, val2, val3, val4, val5, val6, val7, val8); + } template static Vec256 blend(Vec256 a, Vec256 b) { return _mm256_blend_epi32(a, b, mask); } + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + static Vec256 arange(int32_t base = 0, int32_t step = 1) { + return Vec256( + base, base + step, base + 2 * step, base + 3 * step, + base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step); + } static Vec256 set(Vec256 a, Vec256 b, int32_t count = size) { switch (count) { @@ -124,7 +171,7 @@ struct Vec256 : public Vec256i { void store(void* ptr, int count = size) const { if (count == size) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); - } else { + } else if (count > 0) { __at_align32__ int32_t tmp_values[size]; _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(int32_t)); @@ -135,6 +182,30 @@ struct Vec256 : public Vec256i { Vec256 abs() const { return _mm256_abs_epi32(values); } + Vec256 operator==(const Vec256& other) const { + return _mm256_cmpeq_epi32(values, other.values); + } + Vec256 operator!=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto eq = _mm256_cmpeq_epi32(values, other.values); + return _mm256_xor_si256(zero, eq); // invert + } + Vec256 operator<(const Vec256& other) const { + return _mm256_cmpgt_epi32(other.values, values); + } + Vec256 operator<=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto gt = _mm256_cmpgt_epi32(values, other.values); + return _mm256_xor_si256(zero, gt); // invert + } + Vec256 operator>(const Vec256& other) const { + return _mm256_cmpgt_epi32(values, other.values); + } + Vec256 operator>=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto lt = _mm256_cmpgt_epi32(other.values, values); + return _mm256_xor_si256(zero, lt); // invert + } }; template <> @@ -143,6 +214,13 @@ struct Vec256 : public Vec256i { using Vec256i::Vec256i; Vec256() {} Vec256(int16_t v) { values = _mm256_set1_epi16(v); } + Vec256(int16_t val1, int16_t val2, int16_t val3, int16_t val4, + int16_t val5, int16_t val6, int16_t val7, int16_t val8, + int16_t val9, int16_t val10, int16_t val11, int16_t val12, + int16_t val13, int16_t val14, int16_t val15, int16_t val16) { + values = _mm256_setr_epi16(val1, 
val2, val3, val4, val5, val6, val7, val8, + val9, val10, val11, val12, val13, val14, val15, val16); + } template static Vec256 blend(Vec256 a, Vec256 b) { __at_align32__ int16_t tmp_values[size]; @@ -181,6 +259,17 @@ struct Vec256 : public Vec256i { tmp_values[15] = _mm256_extract_epi16(b.values, 15); return loadu(tmp_values); } + static Vec256 blendv(const Vec256& a, const Vec256& b, + const Vec256& mask) { + return _mm256_blendv_epi8(a.values, b.values, mask.values); + } + static Vec256 arange(int16_t base = 0, int16_t step = 1) { + return Vec256( + base, base + step, base + 2 * step, base + 3 * step, + base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step, + base + 8 * step, base + 9 * step, base + 10 * step, base + 11 * step, + base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step); + } static Vec256 set(Vec256 a, Vec256 b, int16_t count = size) { switch (count) { @@ -230,7 +319,7 @@ struct Vec256 : public Vec256i { void store(void* ptr, int count = size) const { if (count == size) { _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); - } else { + } else if (count > 0) { __at_align32__ int16_t tmp_values[size]; _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(int16_t)); @@ -241,6 +330,30 @@ struct Vec256 : public Vec256i { Vec256 abs() const { return _mm256_abs_epi16(values); } + Vec256 operator==(const Vec256& other) const { + return _mm256_cmpeq_epi16(values, other.values); + } + Vec256 operator!=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto eq = _mm256_cmpeq_epi16(values, other.values); + return _mm256_xor_si256(zero, eq); // invert + } + Vec256 operator<(const Vec256& other) const { + return _mm256_cmpgt_epi16(other.values, values); + } + Vec256 operator<=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto gt = _mm256_cmpgt_epi16(values, other.values); + return _mm256_xor_si256(zero, gt); // invert + } + Vec256 operator>(const Vec256& other) const { + return _mm256_cmpgt_epi16(values, other.values); + } + Vec256 operator>=(const Vec256& other) const { + auto zero = _mm256_set1_epi64x(0); + auto lt = _mm256_cmpgt_epi16(other.values, values); + return _mm256_xor_si256(zero, lt); // invert + } }; template <> @@ -258,6 +371,21 @@ Vec256 inline operator+(const Vec256& a, const Vec256 return _mm256_add_epi16(a, b); } +template <> +Vec256 inline operator-(const Vec256& a, const Vec256& b) { + return _mm256_sub_epi64(a, b); +} + +template <> +Vec256 inline operator-(const Vec256& a, const Vec256& b) { + return _mm256_sub_epi32(a, b); +} + +template <> +Vec256 inline operator-(const Vec256& a, const Vec256& b) { + return _mm256_sub_epi16(a, b); +} + // AVX2 has no intrinsic for int64_t multiply so it needs to be emulated // This could be implemented more efficiently using epi32 instructions // This is also technically avx compatible, but then we'll need AVX @@ -293,7 +421,7 @@ Vec256 inline operator*(const Vec256& a, const Vec256 } template -Vec256 intdiv_256(const Vec256& a, const Vec256& b) { +Vec256 inline intdiv_256(const Vec256& a, const Vec256& b) { T values_a[Vec256::size]; T values_b[Vec256::size]; a.store(values_a); @@ -304,20 +432,26 @@ Vec256 intdiv_256(const Vec256& a, const Vec256& b) { return Vec256::loadu(values_a); } -template <> -Vec256 inline operator/(const Vec256& a, const Vec256& b) { - return intdiv_256(a, b); +#define DEFINE_INTEGER_BINARY_OP(op, func) \ +template <> \ +Vec256 inline operator op(const Vec256& a, 
const Vec256& b) { \ + return func(a, b); \ +} \ +template <> \ +Vec256 inline operator op(const Vec256& a, const Vec256& b) { \ + return func(a, b); \ +} \ +template <> \ +Vec256 inline operator op(const Vec256& a, const Vec256& b) { \ + return func(a, b); \ } -template <> -Vec256 inline operator/(const Vec256& a, const Vec256& b) { - return intdiv_256(a, b); -} +DEFINE_INTEGER_BINARY_OP(/, intdiv_256) +DEFINE_INTEGER_BINARY_OP(&, _mm256_and_si256) +DEFINE_INTEGER_BINARY_OP(|, _mm256_or_si256) +DEFINE_INTEGER_BINARY_OP(^, _mm256_xor_si256) -template <> -Vec256 inline operator/(const Vec256& a, const Vec256& b) { - return intdiv_256(a, b); -} +#undef DEFINE_INTEGER_BINARY_OP #endif diff --git a/aten/src/ATen/cuda/ATenCUDAGeneral.h b/aten/src/ATen/cuda/ATenCUDAGeneral.h index 7b41f1fe3f7235..761551f808610b 100644 --- a/aten/src/ATen/cuda/ATenCUDAGeneral.h +++ b/aten/src/ATen/cuda/ATenCUDAGeneral.h @@ -1,5 +1,9 @@ #pragma once +#include +#include +#include + #ifdef _WIN32 # if defined(ATen_cuda_EXPORTS) || defined(caffe2_gpu_EXPORTS) || defined(CAFFE2_CUDA_BUILD_MAIN_LIB) # define AT_CUDA_API __declspec(dllexport) diff --git a/aten/src/ATen/cuda/CUDAHalf.cu b/aten/src/ATen/cuda/CUDAHalf.cu deleted file mode 100644 index bd121250ee4847..00000000000000 --- a/aten/src/ATen/cuda/CUDAHalf.cu +++ /dev/null @@ -1,51 +0,0 @@ -#include "ATen/core/Half.h" -#include "ATen/cuda/CUDAHalf.cuh" - -#include -#include -#include - -namespace at { -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - -half Converter::operator()(Half aten_half) { - return half{aten_half.x}; -} - -half Converter::operator()(double value) { - return half{Half(value).x}; -} - -Half Converter::operator()(half cuda_half) { - return Half(cuda_half.x, Half::from_bits); -} -#else -half Converter::operator()(Half aten_half) { - __half_raw x_raw; - x_raw.x = aten_half.x; - return half(x_raw); -} - -Half Converter::operator()(half cuda_half) { - __half_raw raw(cuda_half); - return Half(raw.x, Half::from_bits); -} - -half Converter::operator()(double value) { - __half_raw raw; - raw.x = Half(value).x; - return half {raw}; -} - -template <> __half HalfFix(Half h) { - __half_raw raw; - raw.x = h.x; - return __half{raw}; -} - -template <> Half HalfFix(__half h) { - __half_raw raw(h); - return Half(raw.x, Half::from_bits); -} -#endif -} // namespace at diff --git a/aten/src/ATen/cuda/CUDAHalf.cuh b/aten/src/ATen/cuda/CUDAHalf.cuh deleted file mode 100644 index 6558ed518ac1fd..00000000000000 --- a/aten/src/ATen/cuda/CUDAHalf.cuh +++ /dev/null @@ -1,31 +0,0 @@ -#pragma once - -#include "ATen/cuda/ATenCUDAGeneral.h" -#include "ATen/core/Half.h" - -#include -#include -#include - -namespace at { - -template <> -struct AT_CUDA_API Converter { - half operator()(Half); -}; - -template <> -struct AT_CUDA_API Converter { - Half operator()(half); -}; - -template <> -struct AT_CUDA_API Converter { - half operator()(double); -}; - -#if CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__) -template <> __half HalfFix(Half h); -template <> Half HalfFix(__half h); -#endif -} // namespace at diff --git a/aten/src/ATen/cuda/NumericLimits.cuh b/aten/src/ATen/cuda/NumericLimits.cuh index 325cbce737dd51..981bf8c1c34efa 100644 --- a/aten/src/ATen/cuda/NumericLimits.cuh +++ b/aten/src/ATen/cuda/NumericLimits.cuh @@ -3,6 +3,7 @@ #include #include #include +#include // NumericLimits.cuh is a holder for numeric limits definitions of commonly used // types. This header is very specific to ROCm HIP and may be removed in the future. 
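(Aside, not part of the diff: the bounds in NumericLimits.cuh are __host__ __device__, so host or kernel code can use them directly. A tiny sketch, assuming this hunk belongs to the at::numeric_limits specialization for double whose upper_bound() appears just below.)

  // Seed a min-reduction with +inf using the header's bound helper.
  double init = at::numeric_limits<double>::upper_bound();  // returns inf, per the definition below
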
@@ -101,4 +102,4 @@ struct numeric_limits { static inline __host__ __device__ double upper_bound() { return inf; } }; -} // namespace at \ No newline at end of file +} // namespace at diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index e87a7bb88f8eb4..5df218a89cc06d 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -39,7 +39,7 @@ def TypedDict(name, attrs, total=True): # type: ignore # declaration under Type.h (right now, we call this template # BROADCAST but it also handles default arguments) TYPE_METHOD_DECLARATION_BROADCAST = CodeTemplate("""\ -${return_type} ${api_name}(${type_method_formals_with_defaults}) const override; +${return_type} ${api_name}(${type_method_formals}) const override; """) # 2. broadcasting functions are implemented in Type.cpp TYPE_METHOD_DEFINITION_BROADCAST = CodeTemplate("""\ @@ -60,18 +60,18 @@ def TypedDict(name, attrs, total=True): # type: ignore # for 'native' declarations (so the native dispatch is hardcoded into # the template here.) PURE_VIRTUAL_TYPE_METHOD_DECLARATION = CodeTemplate("""\ -virtual ${return_type} ${method_prefix_derived}${api_name}(${type_method_formals_with_defaults}) const = 0; +virtual ${return_type} ${method_prefix_derived}${api_name}(${type_method_formals}) const = 0; """) DEPRECATED_PURE_VIRTUAL_TYPE_METHOD_DECLARATION = CodeTemplate("""\ AT_DEPRECATED(virtual ${return_type} \ -${method_prefix_derived}${api_name}(${type_method_formals_with_defaults}) const = 0); +${method_prefix_derived}${api_name}(${type_method_formals}) const = 0); """) PURE_VIRTUAL_TYPE_METHOD_DECLARATION_BROADCAST = CodeTemplate("""\ -virtual ${return_type} ${api_name}(${type_method_formals_with_defaults}) const = 0; +virtual ${return_type} ${api_name}(${type_method_formals}) const = 0; """) TYPE_METHOD_DECLARATION_ABSTRACT = CodeTemplate("""\ -${return_type} ${method_prefix_derived}${api_name}(${type_method_formals_with_defaults}) const override; +${return_type} ${method_prefix_derived}${api_name}(${type_method_formals}) const override; """) TYPE_METHOD_DEFINITION_ABSTRACT = CodeTemplate("""\ ${return_type} TypeDefault::${method_prefix_derived}${api_name}(${type_method_formals}) const { @@ -79,7 +79,7 @@ def TypedDict(name, attrs, total=True): # type: ignore } """) TYPE_METHOD_DECLARATION_CONCRETE = CodeTemplate("""\ -${return_type} ${api_name}(${type_method_formals_with_defaults}) const override; +${return_type} ${api_name}(${type_method_formals}) const override; """) TYPE_METHOD_DEFINITION_CONCRETE = CodeTemplate("""\ ${return_type} TypeDefault::${api_name}(${type_method_formals}) const { @@ -107,6 +107,10 @@ def TypedDict(name, attrs, total=True): # type: ignore # NB: As far as ezyang can tell, we don't *have* to codegen this, # because we will inherit it from the TYPE_METHOD_DEFINITION_CONCRETE in # the superclass. But it doesn't seem to be harmful. +# +# TODO: self_ty is a hack to make things work for native methods which need to +# take a dtype, but also need to dispatch differently for different types. +# Eliminate it at some point. 
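(Aside, not part of the diff: switching the Type declaration templates from ${type_method_formals_with_defaults} to ${type_method_formals} means the generated virtuals no longer repeat default argument values; defaults presumably stay on the Tensor-method and namespace-function signatures, whose *_with_defaults formals are untouched. Roughly, for a made-up operator:)

  // previously emitted into Type.h:
  //   virtual Tensor foo(const Tensor & self, Scalar alpha = 1) const = 0;
  // emitted after this change:
  //   virtual Tensor foo(const Tensor & self, Scalar alpha) const = 0;
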
TYPE_DERIVED_DEFINITION_NATIVE = CodeTemplate("""\ ${return_type} ${Type}::${api_name}(${type_method_formals}) const { ${device_guard_declaration} @@ -173,7 +177,7 @@ def TypedDict(name, attrs, total=True): # type: ignore # the same name (but different signature) already ZERO_DIM_CHECK = CodeTemplate("""\ if (${check_name}.dim() == 0) { - return static_cast(this)->${api_name}(${zero_dim_actuals}); + return static_cast(this)->${api_name}(${zero_dim_actuals}); }""") ZERO_DIM_ONLY = CodeTemplate("""\ @@ -183,12 +187,12 @@ def TypedDict(name, attrs, total=True): # type: ignore SPARSE_CHECK = CodeTemplate("""\ if(${check_name}.type().is_sparse()) { - return static_cast(this)->${api_name}(${sparse_actuals}); + return static_cast(this)->${api_name}(${sparse_actuals}); }""") BUFFER_DEFINITION = CodeTemplate("""\ -auto ${name}_ = c10::make_intrusive( - ${Backend}TensorId(), ScalarType::${ScalarName}, ${THTensor}_new(), false).release(); +auto ${name}_ = c10::make_intrusive( + ${Backend}TensorId(), caffe2::TypeMeta::Make<${ScalarType}>(), ${THTensor}_new(), false).release(); auto ${name} = Tensor(${name}_, false);""") CONDITIONAL_INITIALIZER = CodeTemplate("""\ @@ -198,8 +202,6 @@ def TypedDict(name, attrs, total=True): # type: ignore CALL_TEMPLATE = CodeTemplate("${cname}(${actuals})") -HALF_CONVERSION = CodeTemplate("convert(${value})") - class NYIError(Exception): """Indicates we don't support this declaration yet""" @@ -330,18 +332,19 @@ def __init__(self, reason): CHECKED_USE_NULLABLE = CodeTemplate('${arg_name}_ ? ${usage} : NULL') ALLOC_NOARGS_WRAP = { - 'THTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::${ScalarName}, allocator(), false).release()', - 'THBoolTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Byte, allocator(), false).release()', - 'THIndexTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Long, allocator(), false).release()', - 'THIntegerTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Int, allocator(), false).release()', - 'THDenseTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::${ScalarName}, allocator(), false).release()', - 'THDenseIndexTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Long, allocator(), false).release()' + 'THTensor*': 'c10::make_intrusive' + '(${Backend}TensorId(), caffe2::TypeMeta::Make<${ScalarType}>(), allocator(), false).release()', + 'THBoolTensor*': 'c10::make_intrusive' + '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Byte), allocator(), false).release()', + 'THIndexTensor*': 'c10::make_intrusive' + '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Long), allocator(), false).release()', + 'THIntegerTensor*': 'c10::make_intrusive' + '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Int), allocator(), false).release()', + 'THDenseTensor*': 'c10::make_intrusive' + '(${Backend}TensorId(), caffe2::TypeMeta::Make<${ScalarType}>(), allocator(), false).release()', + 'THDenseIndexTensor*': 'c10::make_intrusive' + '(${Backend}TensorId(), scalarTypeToTypeMeta(ScalarType::Long), ' + 'allocator(), false).release()' } ALLOC_WRAP = { @@ -390,6 +393,7 @@ def __getitem__(self, x): 'type_registrations': List[str], 'type_headers': List[str], 'pure_virtual_type_method_declarations': List[str], + 'pure_virtual_extended_type_method_declarations': List[str], 'type_method_declarations': List[str], 'type_method_definitions': List[str], 'type_method_inline_definitions': List[str], @@ -490,6 +494,9 @@ def __getitem__(self, x): 
'formals': List[str], 'inferred_type': str, 'inplace': bool, + # This controls whether or not we generate the interface in Type or + # TypeExtendedInterface + 'extended_method': bool, 'method_actuals': List[str], 'method_formals_with_defaults': List[str], 'method_formals': List[str], @@ -509,7 +516,6 @@ def __getitem__(self, x): 'type_definition_body': List[str], 'type_method_actuals': List[str], 'type_method_definition_dispatch': str, - 'type_method_formals_with_defaults': List[str], 'type_method_formals': List[str], 'variants': str, 'when_spares_dispatch': str, @@ -811,7 +817,6 @@ def process_option(option, output_options): # There are no cases where these differ, but they do in native_functions option['type_method_formals'] = option['formals'] - option['type_method_formals_with_defaults'] = option['formals_with_defaults'] option['type_method_actuals'] = option['actuals'] option['const_mark'] = '' if option['inplace'] else ' const' @@ -836,8 +841,12 @@ def process_option(option, output_options): # NN function with no _forward/_backward suffix don't have cimpls. # They call the _forward function and discard any buffer returns abstract = False - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + if option['extended_method']: + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + else: + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) top_env['type_method_declarations'].append( TYPE_METHOD_DECLARATION_CONCRETE.substitute(env)) body = emit_nn_body(option) @@ -845,17 +854,27 @@ def process_option(option, output_options): TYPE_METHOD_DEFINITION_CONCRETE.substitute( env, type_definition_body=body)) elif broadcast_arg is None: - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + if option['extended_method']: + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + else: + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) top_env['type_method_declarations'].append( TYPE_METHOD_DECLARATION_ABSTRACT.substitute(env)) top_env['type_method_definitions'].append( TYPE_METHOD_DEFINITION_ABSTRACT.substitute(env)) else: - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION_BROADCAST.substitute(env)) - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + if option['extended_method']: + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION_BROADCAST.substitute(env)) + else: + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION_BROADCAST.substitute(env)) top_env['type_method_declarations'].append( TYPE_METHOD_DECLARATION_BROADCAST.substitute(env)) top_env['type_method_declarations'].append( @@ -888,7 +907,7 @@ def process_option(option, output_options): method_of.append('Tensor') if is_namespace_function: - option['inferred_type'] = 'infer_type({})'.format(dispatch_tensor) + 
option['inferred_type'] = 'detail::infer_type({})'.format(dispatch_tensor) top_env['function_declarations'].append( FUNCTION_DECLARATION.substitute(env)) top_env['function_definitions'].append( @@ -1031,7 +1050,6 @@ def find_formal(formal_name, formals): dispatch_type['is_type_dispatched'] = True option['type_method_formals'] = [format_formal(f) for f in formals if f != dispatch_type] - option['type_method_formals_with_defaults'] = [formal_with_default(f) for f in formals if f != dispatch_type] option['type_method_actuals'] = [f['name'] for f in formals if f != dispatch_type] option['native_actuals'] = [f['name'] if f != dispatch_type else '*this' for f in formals] @@ -1060,11 +1078,21 @@ def find_formal(formal_name, formals): # Factory methods are not dispatched over `Type`. if not is_factory_method: if option['deprecated']: + # Deprecated functions are always non-extended, + # because they need to be made available from Type + # (the public interface) so that code like + # tensor.type().arange(...) keeps working. Once + # we remove the deprecated functions, we can eliminate + # these methods entirely. top_env['pure_virtual_type_method_declarations'].append( DEPRECATED_PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) else: - top_env['pure_virtual_type_method_declarations'].append( - PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + if option['extended_method']: + top_env['pure_virtual_extended_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) + else: + top_env['pure_virtual_type_method_declarations'].append( + PURE_VIRTUAL_TYPE_METHOD_DECLARATION.substitute(env)) top_env['type_method_declarations'].append(TYPE_METHOD_DECLARATION_CONCRETE.substitute(env)) dispatch = option['type_method_definition_dispatch'] option['native_type_method_dispatch'] = dispatch @@ -1116,9 +1144,9 @@ def find_formal(formal_name, formals): if is_namespace_function: if dispatch_type: - option['inferred_type'] = dispatch_type['name'] + option['inferred_type'] = 'static_cast({})'.format(dispatch_type['name']) elif dispatch_tensor: - option['inferred_type'] = 'infer_type({})'.format(dispatch_tensor) + option['inferred_type'] = 'detail::infer_type({})'.format(dispatch_tensor) else: # doesn't depend on a specific type, use undefined float option['inferred_type'] = 'at::getNonVariableType(at::Backend::Undefined, at::ScalarType::Float)' @@ -1170,8 +1198,6 @@ def create_derived(backend_type_env, declarations): is_cuda = 'CUDA' in backend_type_env['Backend'] - real_is_half = backend_type_env['ScalarName'] == 'Half' - def replace_with_null(argument): # type: (THFormal) -> bool return (argument['type'] == 'THGenerator*' and @@ -1198,8 +1224,6 @@ def get_argument(argument, option): elif requires_checked_cast(argument): checked_use = CHECKED_USE.get( argument['type'], '{}_').format(argument['name']) - if real_is_half and argument['type'] == 'real': - checked_use = HALF_CONVERSION.substitute(value=checked_use) if nullable_argument(argument): checked_use = CHECKED_USE_NULLABLE.substitute( env={}, arg_name=argument['name'], usage=checked_use) @@ -1295,11 +1319,12 @@ def allocate_arg(env, arg, output_count): tensor_arg = '{}_'.format(name) if arg.get('mask', False): allocation = 'output_mask[{}] ? {} : nullptr'.format(output_count, allocation) - tensor_arg = ('{}_ == nullptr ? (TensorImpl*)UndefinedTensor::singleton() : (TensorImpl*){}_' + tensor_arg = ('{}_ == nullptr ? 
(TensorImpl*)UndefinedTensorImpl::singleton() : (TensorImpl*){}_' .format(name, name)) + intrusive_ptr_type = 'c10::intrusive_ptr' return [ 'auto {}_ = {};'.format(name, allocation), - 'auto {} = Tensor({}, false);'.format(name, tensor_arg), + 'auto {} = Tensor({}::reclaim({}));'.format(name, intrusive_ptr_type, tensor_arg), ] def resize_arg(arg): @@ -1507,7 +1532,10 @@ def emit_body(env, option): else "" wrapped_tensor = CodeTemplate(ALLOC_WRAP[ret['type']]).substitute( env, arguments=[call]) - return_tensor = "return Tensor((${wrapped_tensor})${maybe_scalar},false);" + return_tensor = ( + "return Tensor(" + + "c10::intrusive_ptr::reclaim(" + + "(${wrapped_tensor})${maybe_scalar}));") body.append(CodeTemplate(return_tensor).substitute( env, wrapped_tensor=wrapped_tensor, maybe_scalar=maybe_scalar)) # return the same underlying Tensor type for both real and accreal; this ensures diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 3f962961f55812..3112e5ff0424ab 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -1,5 +1,6 @@ import argparse import os +import filecmp import yaml from collections import OrderedDict @@ -40,19 +41,27 @@ parser.add_argument( '-d', '--install_dir', help='output directory', default='ATen') options = parser.parse_args() +gen_to_source = os.environ.get('GEN_TO_SOURCE') # update source directly as part of gen +if not gen_to_source: + core_install_dir = os.path.join(options.install_dir, 'core_tmp') if options.install_dir is not None else None +else: + core_install_dir = os.path.join(options.source_path, 'core') if options.install_dir is not None and not os.path.exists(options.install_dir): os.makedirs(options.install_dir) +if core_install_dir is not None and not os.path.exists(core_install_dir): + os.makedirs(core_install_dir) class FileManager(object): - def __init__(self): + def __init__(self, install_dir=None): + self.install_dir = install_dir if install_dir else options.install_dir self.filenames = set() self.outputs_written = False self.undeclared_files = [] def will_write(self, filename): - filename = '{}/{}'.format(options.install_dir, filename) + filename = '{}/{}'.format(self.install_dir, filename) if self.outputs_written: raise Exception("'will_write' can only be called before " + "the call to write_outputs, refactor so outputs are registered " + @@ -78,7 +87,7 @@ def write_outputs(self, filename): self.outputs_written = True def write(self, filename, s, env=None): - filename = '{}/{}'.format(options.install_dir, filename) + filename = '{}/{}'.format(self.install_dir, filename) if isinstance(s, CodeTemplate): assert env is not None env['generated_comment'] = "@" + "generated by aten/src/ATen/gen.py" @@ -107,6 +116,7 @@ def check_all_files_written(self): SPARSE_TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/SparseTypeDerived.cpp") TYPE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.h") TYPE_H = CodeTemplate.from_file(TEMPLATE_PATH + "/Type.h") +TYPE_EXTENDED_INTERFACE_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeExtendedInterface.h") TYPE_DEFAULT_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDefault.h") TYPE_DEFAULT_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDefault.cpp") @@ -127,6 +137,7 @@ def check_all_files_written(self): context->registerType(Backend::${backend}, ScalarType::${scalar_type}, new ${type_name}()); """) +core_file_manager = FileManager(core_install_dir) file_manager = FileManager() cuda_file_manager = FileManager() @@ -155,7 +166,7 @@ def check_all_files_written(self): 
('Int', 'int', 'Long', 'int32_t', False), ('Long', 'int64_t', 'Long', 'int64_t', False), ('Short', 'int16_t', 'Long', 'int16_t', False), - ('Half', 'Half', 'Double', 'THHalf', True), + ('Half', 'Half', 'Double', 'at::Half', True), ] # shared environment for non-derived base classes Type.h Tensor.h Storage.h @@ -165,6 +176,7 @@ def check_all_files_written(self): 'cuda_type_registrations': [], 'cuda_type_headers': [], 'pure_virtual_type_method_declarations': [], + 'pure_virtual_extended_type_method_declarations': [], 'type_method_declarations': [], 'type_method_definitions': [], 'type_method_inline_definitions': [], @@ -236,7 +248,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations env['DenseBackend'] = backend env['storage_tensor_headers'] = [] if density != 'Sparse': - env['storage_tensor_headers'] = ['#include "ATen/TensorImpl.h"'] + env['storage_tensor_headers'] = ['#include "ATen/core/TensorImpl.h"'] # used for generating switch logic for external functions tag = density_tag + backend + scalar_name @@ -251,7 +263,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations '#undef THNN_', '#undef THCIndexTensor_', ] - env['extra_cuda_headers'] = ['#include '] + env['extra_cuda_headers'] = ['#include '] env['extra_cuda_headers'].append('#include ') env['extra_cuda_headers'].append('#include ') env['extra_cuda_headers'].append('#include ') @@ -284,7 +296,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations if scalar_name == "Half": env['SparseTensor'] = 'Tensor' if backend == "CUDA": - env['AS_REAL'] = 'convert' + env['AS_REAL'] = 'convert' declarations, definitions = function_wrapper.create_derived( env, declarations) @@ -330,9 +342,11 @@ def iterate_types(): # so that the script runs quickly when we are just querying the # outputs def declare_outputs(): - files = ['Declarations.yaml', 'Type.h', 'TypeDefault.cpp', 'TypeDefault.h', 'Tensor.h', - 'TensorMethods.h', 'Functions.h', - 'CPUCopy.cpp', 'NativeFunctions.h', + core_files = ['Type.h', 'Tensor.h', 'TensorMethods.h'] + for f in core_files: + core_file_manager.will_write(f) + files = ['Declarations.yaml', 'TypeExtendedInterface.h', 'TypeDefault.cpp', 'TypeDefault.h', + 'Functions.h', 'CPUCopy.cpp', 'NativeFunctions.h', 'RegisterCPU.cpp', 'RegisterCPU.h'] for f in files: file_manager.will_write(f) @@ -399,7 +413,16 @@ def generate_outputs(): all_types.append(generate_storage_type_and_tensor( backend, density, scalar_type, declarations)) - file_manager.write('Type.h', TYPE_H, top_env) + core_files = { + 'Type.h': TYPE_H, + 'Tensor.h': TENSOR_H, + 'TensorMethods.h': TENSOR_METHODS_H + } + + for core_file, core_template_file in core_files.items(): + core_file_manager.write(core_file, core_template_file, top_env) + + file_manager.write('TypeExtendedInterface.h', TYPE_EXTENDED_INTERFACE_H, top_env) file_manager.write('TypeDefault.h', TYPE_DEFAULT_H, top_env) file_manager.write('TypeDefault.cpp', TYPE_DEFAULT_CPP, top_env) @@ -409,8 +432,6 @@ def generate_outputs(): cuda_file_manager.write('RegisterCUDA.h', REGISTER_CUDA_H, top_env) cuda_file_manager.write('RegisterCUDA.cpp', REGISTER_CUDA_CPP, top_env) - file_manager.write('Tensor.h', TENSOR_H, top_env) - file_manager.write('TensorMethods.h', TENSOR_METHODS_H, top_env) file_manager.write('Functions.h', FUNCTIONS_H, top_env) file_manager.write('CPUCopy.cpp', copy_wrapper.create(all_types, 'CPU')) @@ -420,10 +441,25 @@ def generate_outputs(): file_manager.check_all_files_written() 
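# --- Editor's note (illustrative sketch, not part of this patch) ---------------
# The block added just below checks that the freshly generated "core" headers are
# byte-identical to the copies checked into the source tree, using the standard
# library helper filecmp.cmpfiles(..., shallow=False). A minimal standalone version
# of the same pattern is sketched here; the function name, directory arguments and
# file list are hypothetical placeholders, not names from this patch.
import filecmp

def check_generated_matches_source(generated_dir, source_dir, filenames):
    # shallow=False forces a content comparison rather than an os.stat() comparison
    _, mismatch, errors = filecmp.cmpfiles(
        generated_dir, source_dir, filenames, shallow=False)
    if errors:
        raise RuntimeError("could not compare {} between {} and {}"
                           .format(errors, generated_dir, source_dir))
    if mismatch:
        # Tell the developer how to refresh the checked-in copies.
        raise RuntimeError("{} are out of date; copy them from {} into {}"
                           .format(mismatch, generated_dir, source_dir))
# -------------------------------------------------------------------------------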
cuda_file_manager.check_all_files_written() + # check that generated files match source files + core_source_path = os.path.join(options.source_path, 'core') + match, mismatch, errors = filecmp.cmpfiles(core_install_dir, core_source_path, core_files.keys(), shallow=False) + if errors: + raise RuntimeError("Error while trying to compare source and generated files for {}. " + "Source directory: {}. Generated directory: {}." + .format(errors, core_source_path, core_install_dir)) + if mismatch: + file_component = '{}'.format(','.join(mismatch)) + if len(mismatch) > 1: + file_component = '{' + file_component + '}' + update_cmd = "cp {}/{} {}".format(core_install_dir, file_component, core_source_path) + raise RuntimeError("Source files: {} did not match generated files. To update the source files, " + "run \"{}\"".format(mismatch, update_cmd)) declare_outputs() if options.output_dependencies is not None: file_manager.write_outputs(options.output_dependencies) + core_file_manager.write_outputs(options.output_dependencies + "-core") cuda_file_manager.write_outputs(options.output_dependencies + "-cuda") else: generate_outputs() diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index f359d67c72e786..96ddb5ae3928b1 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -155,70 +155,51 @@ static void check_input_shape_forward(const at::Tensor& input, int64_t k = input.ndimension(); int64_t weight_dim = weight.ndimension(); - if (weight_dim != k) { - std::stringstream ss; - ss << "Expected " << weight_dim << "-dimensional input for " << weight_dim - << "-dimensional weight " << weight.sizes() << ", but got input of size " - << input.sizes() << " instead"; - throw std::runtime_error(ss.str()); - } - if (weight.size(0) < groups) { - std::stringstream ss; - ss << "Given groups=" << groups << ", expected weight to be at least " - << groups << " at dimension 0, but got weight of size " << weight.sizes() - << " instead"; - throw std::runtime_error(ss.str()); - } - if (weight.size(0) % groups != 0) { - std::stringstream ss; - ss << "Given groups=" << groups << ", expected weight to be divisible by " - << groups << " at dimension 0, but got weight of size " << weight.sizes() - << " instead"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(weight_dim == k, + "Expected ", weight_dim, "-dimensional input for ", weight_dim, + "-dimensional weight ", weight.sizes(), ", but got ", k, "-dimensional input of size ", + input.sizes(), " instead"); + AT_CHECK(weight.size(0) >= groups, + "Given groups=", groups, ", expected weight to be at least ", groups, + " at dimension 0, but got weight of size ", weight.sizes(), " instead"); + AT_CHECK(weight.size(0) % groups == 0, + "Given groups=", groups, ", expected weight to be divisible by ", + groups, " at dimension 0, but got weight of size ", weight.sizes(), + " instead"); if (!transposed) { - if (input.size(1) != (weight.size(1) * groups)) { - std::stringstream ss; - ss << "Given groups=" << groups << ", weight of size " << weight.sizes() - << ", expected input" << input.sizes() << " to have " - << (weight.size(1) * groups) << " channels, but got " << input.size(1) - << " channels instead"; - throw std::runtime_error(ss.str()); - } - if (bias.defined() && (bias.ndimension() != 1 || bias.size(0) != weight.size(0))) { - std::stringstream ss; - ss << "Given weight of size " << weight.sizes() - << ", expected bias to be 1-dimensional with " << weight.size(0) << " elements" - << ", but got bias 
of size " << bias.sizes() << " instead"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.size(1) == (weight.size(1) * groups), + "Given groups=", groups, ", weight of size ", weight.sizes(), + ", expected input", input.sizes(), " to have ", + (weight.size(1) * groups), " channels, but got ", input.size(1), + " channels instead"); + AT_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight.size(0)), + "Given weight of size ", weight.sizes(), + ", expected bias to be 1-dimensional with ", weight.size(0), " elements", + ", but got bias of size ", bias.sizes(), " instead"); } else { // transposed - if (input.size(1) != weight.size(0)) { - std::stringstream ss; - ss << "Given transposed=" << transposed << ", weight of size " << weight.sizes() - << ", expected input" << input.sizes() << " to have " - << weight.size(0) << " channels, but got " << input.size(1) - << " channels instead"; - throw std::runtime_error(ss.str()); - } - if (bias.defined() && (bias.ndimension() != 1 || bias.size(0) != weight.size(1) * groups)) { - std::stringstream ss; - ss << "Given transposed=" << transposed << ", weight of size " << weight.sizes() - << ", expected bias to be 1-dimensional with " << weight.size(1) * groups << " elements" - << ", but got bias of size " << bias.sizes() << " instead"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.size(1) == weight.size(0), + "Given transposed=", transposed, ", weight of size ", weight.sizes(), + ", expected input", input.sizes(), " to have ", weight.size(0), + " channels, but got ", input.size(1), " channels instead"); + AT_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight.size(1) * groups), + "Given transposed=", transposed, ", weight of size ", weight.sizes(), + ", expected bias to be 1-dimensional with ", weight.size(1) * groups, " elements", + ", but got bias of size ", bias.sizes(), " instead"); } } static auto view4d(const at::Tensor& tensor) -> at::Tensor { - if (tensor.ndimension() != 3) throw std::runtime_error("expected 3D tensor"); + AT_CHECK(tensor.ndimension() == 3, + "expected 3D tensor, got tensor with ", tensor.ndimension(), + " dimensions instead"); return tensor.unsqueeze(2); } static auto view3d(const at::Tensor& tensor) -> at::Tensor { - if (tensor.ndimension() != 4) throw std::runtime_error("expected 4D tensor"); + AT_CHECK(tensor.ndimension() == 4, + "expected 4D tensor, got tensor with ", tensor.ndimension(), + " dimensions instead"); return tensor.squeeze(2); } @@ -293,7 +274,7 @@ static inline std::vector convolution_expand_param_if_needed( ss << "expected " << param_name << " to be a single integer value or a " << "list of " << expected_dim << " values to match the convolution " << "dimensions, but got " << param_name << "=" << list_param; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } else { return list_param.vec(); } @@ -311,9 +292,7 @@ at::Tensor _convolution( auto k = weight.ndimension(); int64_t dim = k - 2; - if (dim <= 0) { - throw std::runtime_error("weight should have at least two dimensions"); - } + AT_CHECK(dim > 0, "weight should at least have at least two dimensions"); ConvParams params; params.stride = convolution_expand_param_if_needed(stride_, "stride", dim); @@ -326,8 +305,8 @@ at::Tensor _convolution( params.deterministic = deterministic; params.cudnn_enabled = cudnn_enabled; - if (params.is_padding_neg()) throw std::runtime_error("negative padding is not supported"); - if (params.is_output_padding_neg()) throw std::runtime_error("negative 
output_padding is not supported"); + AT_CHECK(!params.is_padding_neg(), "negative padding is not supported"); + AT_CHECK(!params.is_output_padding_neg(), "negative output_padding is not supported"); check_input_shape_forward(input, weight, bias, params.groups, params.transposed); @@ -349,16 +328,12 @@ at::Tensor _convolution( output = at::thnn_conv_depthwise2d(input, weight, kernel_size, bias, stride, padding, dilation); } else if (params.use_cudnn(input)) { - if (input.type() != weight.type()){ - std::stringstream ss; - ss << "Input type (" << input.type().toString() << ") and weight type (" << weight.type().toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } - if (bias.defined() && input.type() != bias.type()){ - std::stringstream ss; - ss << "Input type (" << input.type().toString() << ") and bias type (" << bias.type().toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.type() == weight.type(), + "Input type (", input.type().toString(), ") and weight type (", weight.type().toString(), + ") should be the same"); + AT_CHECK(!bias.defined() || (input.type() == bias.type()), + "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(), + ") should be the same"); if (params.transposed) { output = at::cudnn_convolution_transpose( @@ -370,16 +345,12 @@ at::Tensor _convolution( params.padding, params.stride, params.dilation, params.groups, params.benchmark, params.deterministic); } } else if (params.use_miopen(input)) { - if (input.type() != weight.type()){ - std::stringstream ss; - ss << "Input type (" << input.type().toString() << ") and weight type (" << weight.type().toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } - if (bias.defined() && input.type() != bias.type()){ - std::stringstream ss; - ss << "Input type (" << input.type().toString() << ") and bias type (" << bias.type().toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.type() == weight.type(), + "Input type (", input.type().toString(), ") and weight type (", weight.type().toString(), + ") should be the same"); + AT_CHECK(!bias.defined() || (input.type() == bias.type()), + "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(), + ") should be the same"); if (params.transposed) { output = at::miopen_convolution_transpose( @@ -392,16 +363,12 @@ at::Tensor _convolution( } } else if (params.use_mkldnn(input)) { #if AT_MKLDNN_ENABLED() - if (input.type() != weight.type()){ - std::stringstream ss; - ss << "Input type (" << input.toString() << ") and weight type (" << weight.toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } - if (bias.defined() && input.type() != bias.type()){ - std::stringstream ss; - ss << "Input type (" << input.toString() << ") and bias type (" << bias.toString() << ") should be the same"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.type() == weight.type(), + "Input type (", input.type().toString(), ") and weight type (", weight.type().toString(), + ") should be the same"); + AT_CHECK(!bias.defined() || (input.type() == bias.type()), + "Input type (", input.type().toString(), ") and bias type (", bias.type().toString(), + ") should be the same"); output = at::mkldnn_convolution(input, weight, bias, params.padding, params.stride, params.dilation, params.groups); #endif @@ -487,7 +454,7 @@ at::Tensor _convolution_nogroup( } } - throw std::runtime_error("unsupported ConvNd 
parameters"); + AT_ERROR("unsupported ConvNd parameters"); } static Tensor subvariable(const Tensor& var, int dim, int groups, int g) { diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/DispatchStub.h index dad05dcf8b47a8..42ef6a4f6bb5f1 100644 --- a/aten/src/ATen/native/DispatchStub.h +++ b/aten/src/ATen/native/DispatchStub.h @@ -22,7 +22,10 @@ // DEFINE_DISPATCH(stub); // // In native/cpu/MyKernel.cpp: -// void kernel(const Tensor& x) { ... } +// namespace { +// // use anonymous namespace so that different cpu versions won't conflict +// void kernel(const Tensor& x) { ... } +// } // REGISTER_DISPATCH(stub, &kernel); // // To call: @@ -46,19 +49,22 @@ enum class CPUCapability { CPUCapability get_cpu_capability(); template -struct AT_API DispatchStub { - static_assert(std::is_pointer::value, "FnPtr should be a pointer type"); +struct AT_API DispatchStub; + +template +struct AT_API DispatchStub { + using FnPtr = rT (*) (Args...); template - void operator()(DeviceType device_type, ArgTypes&&... args) { + rT operator()(DeviceType device_type, ArgTypes&&... args) { if (device_type == DeviceType::CPU) { if (!cpu_dispatch_ptr) { cpu_dispatch_ptr = choose_cpu_impl(); } - (*cpu_dispatch_ptr)(std::forward(args)...); + return (*cpu_dispatch_ptr)(std::forward(args)...); } else if (device_type == DeviceType::CUDA) { AT_ASSERTM(cuda_dispatch_ptr, "DispatchStub: missing CUDA kernel"); - (*cuda_dispatch_ptr)(std::forward(args)...); + return (*cuda_dispatch_ptr)(std::forward(args)...); } else { AT_ERROR("DispatchStub: unsupported device type", device_type); } @@ -103,6 +109,11 @@ struct RegisterDispatch { }; } // anonymous namespace +// Compiler will complain if you put things like std::tuple in +// the `fn` argument of DECLARE_DISPATCH. Some possible workarounds, e.g., +// adding parentheses and using helper struct to get rid of the parentheses, do +// not work with MSVC. So do a `using`-declaration if you need to pass in such +// `fn`, e.g., grid_sampler_2d_backward_cpu_kernel in GridSampleKernel.h. #define DECLARE_DISPATCH(fn, name) \ struct name : DispatchStub {}; \ extern AT_API struct name name diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 67c0877f9fa072..99fa4c701d4bbf 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -67,18 +67,17 @@ Tensor embedding_sparse_backward( int64_t num_features = grad_.size(-1); auto weight_size = std::array{{ num_weights, num_features }}; auto& dense_type = grad.type(); - auto& sparse_type = dense_type.toBackend(grad.is_cuda() ? 
Backend::SparseCUDA : Backend::SparseCPU); // check if all our grad come from padding_idx if (grad.numel() == 0) { - return sparse_type._sparse_coo_tensor_unsafe(indices_.type().tensor({1, 0}), - dense_type.tensor({0, num_features}), - weight_size); + return at::_sparse_coo_tensor_unsafe(indices_.type().tensor({1, 0}), + dense_type.tensor({0, num_features}), + weight_size); } auto index = indices.reshape({1, -1}); auto values = grad.reshape({-1, num_features}); - return sparse_type._sparse_coo_tensor_unsafe(index, values, weight_size); + return at::_sparse_coo_tensor_unsafe(index, values, weight_size); } Tensor embedding_dense_backward_cpu( diff --git a/aten/src/ATen/native/Gesv.cpp b/aten/src/ATen/native/Gesv.cpp index b45e2a4f98860e..dcb8a0964d2f90 100644 --- a/aten/src/ATen/native/Gesv.cpp +++ b/aten/src/ATen/native/Gesv.cpp @@ -44,7 +44,7 @@ template<> void lapackGesv( #endif template -static void applyGesv(Tensor& b, Tensor& A, std::vector infos) { +static void applyGesv(Tensor& b, Tensor& A, std::vector& infos) { #ifndef USE_LAPACK AT_ERROR("gesv: LAPACK library not found in compilation"); #endif @@ -117,8 +117,7 @@ std::tuple gesv_out( Tensor& solution, Tensor& lu, const Tensor& self, const Tensor& A) { if (self.dim() > 2 || A.dim() > 2) { AT_ERROR("torch.gesv() with the `out` keyword does not support batching. " - "b.dim() (%lld) and A.dim() (%lld) must both be 2.", - (long long)self.dim(), (long long)A.dim()); + "b.dim() (", self.dim(), ") and A.dim() (", A.dim(), ") must both be 2."); } return at::_gesv_single_out(solution, lu, self, A); } diff --git a/aten/src/ATen/native/Gesv.h b/aten/src/ATen/native/Gesv.h index 2d265520f3d21c..a3ba6ec1a8f127 100644 --- a/aten/src/ATen/native/Gesv.h +++ b/aten/src/ATen/native/Gesv.h @@ -5,26 +5,23 @@ namespace at { namespace native { static inline void checkInputs(const Tensor& self, const Tensor& A) { if (A.size(-1) != A.size(-2)) { AT_ERROR("A must be batches of square matrices, " - "but they are %lld by %lld matrices", + "but they are ", A.size(-1), " by ", A.size(-2), " matrices", (long long)A.size(-1), (long long)A.size(-2)); } if (A.size(-1) != self.size(-2)) { AT_ERROR("Incompatible matrix sizes for matmul: each A " - "matrix is %llu by %lld but each b matrix is %lld by %lld.", - (long long)A.size(-1), (long long)A.size(-1), - (long long)self.size(-2), (long long)self.size(-1)); + "matrix is ", A.size(-1), " by ", A.size(-1), + " but each b matrix is ", self.size(-2), " by ", self.size(-1)); } } -static inline void checkErrors(std::vector infos) { +static inline void checkErrors(std::vector& infos) { for (size_t i = 0; i < infos.size(); i++) { auto info = infos[i]; if (info < 0) { - AT_ERROR("gesv: For batch %lld: Argument %lld has illegal value", - (long long)i, -info); + AT_ERROR("gesv: For batch ", i, ": Argument ", -info, " has illegal value."); } else if (info > 0) { - AT_ERROR("gesv: For batch %lld: U(%lld,%lld) is zero, singular U.", - (long long)i, info, info); + AT_ERROR("gesv: For batch ", i, ": U(", info, ",", info, ") is zero, singular U."); } } } diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp index 1547ab2c934053..4d09307bd640e0 100644 --- a/aten/src/ATen/native/GridSampler.cpp +++ b/aten/src/ATen/native/GridSampler.cpp @@ -3,8 +3,9 @@ #include "ATen/Device.h" #include "ATen/Error.h" #include "ATen/NativeFunctions.h" -#include "ATen/detail/CUDAHooksInterface.h" #include "ATen/native/GridSampler.h" +#include "ATen/native/cpu/GridSamplerKernel.h" +#include "ATen/cpu/vml.h" #ifdef 
_OPENMP #include @@ -16,6 +17,7 @@ using at::native::detail::GridSamplerInterpolation; using at::native::detail::GridSamplerPadding; namespace { + template static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) { return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); @@ -117,121 +119,10 @@ namespace { } } - template - Tensor grid_sampler_2d_cpu_impl(const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_H = input.size(2); - int64_t inp_W = input.size(3); - int64_t out_H = grid.size(1); - int64_t out_W = grid.size(2); - auto output = at::empty({N, C, out_H, out_W}, input.options()); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sH = input.stride(2); - int64_t inp_sW = input.stride(3); - int64_t grid_sN = grid.stride(0); - int64_t grid_sH = grid.stride(1); - int64_t grid_sW = grid.stride(2); - int64_t grid_sCoor = grid.stride(3); - int64_t out_sN = output.stride(0); - int64_t out_sC = output.stride(1); - int64_t out_sH = output.stride(2); - int64_t out_sW = output.stride(3); - scalar_t *inp_ptr = input.data(); - scalar_t *out_ptr = output.data(); - scalar_t *grid_ptr = grid.data(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t n = 0; n < N; ++n) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w) { - // get the corresponding input x, y co-ordinates from grid - scalar_t ix = grid_ptr_N[h * grid_sH + w * grid_sW]; - scalar_t iy = grid_ptr_N[h * grid_sH + w * grid_sW + grid_sCoor]; - - // normalize ix, iy from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] - ix = ((ix + 1) / 2) * (inp_W - 1); - iy = ((iy + 1) / 2) * (inp_H - 1); - - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix = clip_coordinates(ix, inp_W); - iy = clip_coordinates(iy, inp_H); - } else if (padding_mode == GridSamplerPadding::Reflection) { - // reflect coordinates by image borders - ix = reflect_coordinates(ix, inp_W); - iy = reflect_coordinates(iy, inp_H); - } - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get NE, NW, SE, SW pixel values from (x, y) - int64_t ix_nw = static_cast(std::floor(ix)); - int64_t iy_nw = static_cast(std::floor(iy)); - int64_t ix_ne = ix_nw + 1; - int64_t iy_ne = iy_nw; - int64_t ix_sw = ix_nw; - int64_t iy_sw = iy_nw + 1; - int64_t ix_se = ix_nw + 1; - int64_t iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - // calculate bilinear weighted pixel value and set output pixel - scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; - for (int c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { - // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne - // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se - *out_ptr_NCHW = static_cast(0); - if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; - } - if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; - } - if 
(within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; - } - if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; - } - } - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int64_t ix_nearest = static_cast(std::round(ix)); - int64_t iy_nearest = static_cast(std::round(iy)); - - // assign nearest neighor pixel value to output pixel - scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; - for (int c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { - if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) { - *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW]; - } else { - *out_ptr_NCHW = static_cast(0); - } - } - } - } - } - } - return output; - } - template Tensor grid_sampler_3d_cpu_impl(const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { + GridSamplerInterpolation interpolation_mode, + GridSamplerPadding padding_mode) { int64_t N = input.size(0); int64_t C = input.size(1); int64_t inp_D = input.size(2); @@ -395,167 +286,12 @@ namespace { return output; } - template - std::tuple - grid_sampler_2d_backward_cpu_impl(const Tensor& grad_output, - const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { - auto grad_input = at::zeros_like(input); - auto grad_grid = at::empty_like(grid); - // If interpolation mode is Nearest, then grad_grid is not filled in the - // loop below. - if (interpolation_mode == GridSamplerInterpolation::Nearest) { - grad_grid.zero_(); - } - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_H = input.size(2); - int64_t inp_W = input.size(3); - int64_t out_H = grid.size(1); - int64_t out_W = grid.size(2); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sH = input.stride(2); - int64_t inp_sW = input.stride(3); - int64_t grid_sN = grid.stride(0); - int64_t grid_sH = grid.stride(1); - int64_t grid_sW = grid.stride(2); - int64_t grid_sCoor = grid.stride(3); - int64_t gOut_sN = grad_output.stride(0); - int64_t gOut_sC = grad_output.stride(1); - int64_t gOut_sH = grad_output.stride(2); - int64_t gOut_sW = grad_output.stride(3); - int64_t gInp_sN = grad_input.stride(0); - int64_t gInp_sC = grad_input.stride(1); - int64_t gInp_sH = grad_input.stride(2); - int64_t gInp_sW = grad_input.stride(3); - int64_t gGrid_sN = grad_grid.stride(0); - int64_t gGrid_sW = grad_grid.stride(2); - scalar_t *inp_ptr = input.data(); - scalar_t *grid_ptr = grid.data(); - scalar_t *gOut_ptr = grad_output.data(); - scalar_t *gInp_ptr = grad_input.data(); - scalar_t *gGrid_ptr = grad_grid.data(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t n = 0; n < N; ++n) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN; - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) { - // get the corresponding input x, y co-ordinates from grid - scalar_t ix = grid_ptr_N[h * grid_sH + w * grid_sW]; - scalar_t iy = grid_ptr_N[h * grid_sH + w * grid_sW + grid_sCoor]; - - // normalize ix, iy from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] - ix = ((ix + 1) / 
2) * (inp_W - 1); - iy = ((iy + 1) / 2) * (inp_H - 1); - - // multipliers for gradients on ix and iy - // E.g., 0 for out-of-bound indices when GridSamplerPadding::Border - scalar_t gix_mult, giy_mult; - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix = clip_coordinates_set_grad(ix, inp_W, &gix_mult); - iy = clip_coordinates_set_grad(iy, inp_H, &giy_mult); - } else if (padding_mode == GridSamplerPadding::Reflection) { - // reflect coordinates by image borders - ix = reflect_coordinates_set_grad(ix, inp_W, &gix_mult); - iy = reflect_coordinates_set_grad(iy, inp_H, &giy_mult); - } else { // padding_mode == GridSamplerPadding::Zeros - gix_mult = static_cast(1); - giy_mult = static_cast(1); - } - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get NE, NW, SE, SW pixel values from (x, y) - int64_t ix_nw = static_cast(std::floor(ix)); - int64_t iy_nw = static_cast(std::floor(iy)); - int64_t ix_ne = ix_nw + 1; - int64_t iy_ne = iy_nw; - int64_t ix_sw = ix_nw; - int64_t iy_sw = iy_nw + 1; - int64_t ix_se = ix_nw + 1; - int64_t iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - scalar_t gix = static_cast(0), giy = static_cast(0); - scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - scalar_t *inp_ptr_NC = inp_ptr_N; - // calculate bilinear weighted pixel value and set output pixel - for (int c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { - scalar_t gOut = *gOut_ptr_NCHW; - - // calculate and set grad_input - safe_add_2d(gInp_ptr_NC, iy_nw, ix_nw, gInp_sH, gInp_sW, inp_H, inp_W, nw * gOut); - safe_add_2d(gInp_ptr_NC, iy_ne, ix_ne, gInp_sH, gInp_sW, inp_H, inp_W, ne * gOut); - safe_add_2d(gInp_ptr_NC, iy_sw, ix_sw, gInp_sH, gInp_sW, inp_H, inp_W, sw * gOut); - safe_add_2d(gInp_ptr_NC, iy_se, ix_se, gInp_sH, gInp_sW, inp_H, inp_W, se * gOut); - - // calculate grad_grid - if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { - scalar_t nw_val = inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW]; - gix -= nw_val * (iy_se - iy) * gOut; - giy -= nw_val * (ix_se - ix) * gOut; - } - if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { - scalar_t ne_val = inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW]; - gix += ne_val * (iy_sw - iy) * gOut; - giy -= ne_val * (ix - ix_sw) * gOut; - } - if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { - scalar_t sw_val = inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW]; - gix -= sw_val * (iy - iy_ne) * gOut; - giy += sw_val * (ix_ne - ix) * gOut; - } - if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { - scalar_t se_val = inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW]; - gix += se_val * (iy - iy_nw) * gOut; - giy += se_val * (ix - ix_nw) * gOut; - } - } - - // un-normalize grad_grid values back to [-1, 1] constraints - gix = gix * (inp_W - 1) / 2; - giy = giy * (inp_H - 1) / 2; - - // assuming grad_grid is contiguous - gGrid_ptr_NHW[0] = gix_mult * gix; - gGrid_ptr_NHW[1] = giy_mult * giy; - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int64_t ix_nearest = static_cast(std::round(ix)); - int64_t iy_nearest = static_cast(std::round(iy)); - - // assign nearest neighor pixel value to output pixel - scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; - scalar_t 
*gInp_ptr_NC = gInp_ptr + n * gInp_sN; - for (int c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC) { - // calculate and set grad_input - safe_add_2d(gInp_ptr_NC, iy_nearest, ix_nearest, gInp_sH, gInp_sW, inp_H, inp_W, *gOut_ptr_NCHW); - } - } - } - } - } - return std::make_tuple(grad_input, grad_grid); - } - template std::tuple grid_sampler_3d_backward_cpu_impl(const Tensor& grad_output, - const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { + const Tensor& input, const Tensor& grid, + GridSamplerInterpolation interpolation_mode, + GridSamplerPadding padding_mode) { auto grad_input = at::zeros_like(input); auto grad_grid = at::empty_like(grid); // If interpolation mode is Nearest, then grad_grid is not filled in the @@ -783,18 +519,18 @@ namespace { } return std::make_tuple(grad_input, grad_grid); } -} + +} // namespace // No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. Tensor grid_sampler_2d_cpu(const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode) { - return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler2d_cpu", [&] { - return grid_sampler_2d_cpu_impl( - input, grid, static_cast(interpolation_mode), - static_cast(padding_mode)); - }); + return grid_sampler_2d_cpu_kernel(kCPU, input, grid, interpolation_mode, padding_mode); } +DEFINE_DISPATCH(grid_sampler_2d_cpu_kernel); + + // No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. Tensor grid_sampler_3d_cpu(const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode) { @@ -809,14 +545,11 @@ Tensor grid_sampler_3d_cpu(const Tensor& input, const Tensor& grid, std::tuple grid_sampler_2d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode) { - return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_2d_backward_cpu", [&] { - return grid_sampler_2d_backward_cpu_impl( - grad_output, input, grid, - static_cast(interpolation_mode), - static_cast(padding_mode)); - }); + return grid_sampler_2d_backward_cpu_kernel(kCPU, grad_output, input, grid, interpolation_mode, padding_mode); } +DEFINE_DISPATCH(grid_sampler_2d_backward_cpu_kernel); + // No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. 
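// --- Editor's note (illustrative sketch, not part of this patch) --------------
// grid_sampler_2d_cpu above no longer dispatches on scalar type in place; it now
// forwards to a stub registered per CPU capability, following the pattern that
// DispatchStub.h documents earlier in this diff. A minimal sketch of that wiring,
// using hypothetical names (my_fn / my_stub / my_op / my_kernel):
//
//   // native/MyOp.h
//   using my_fn = Tensor (*)(const Tensor& input, const Tensor& grid,
//                            int64_t interpolation_mode, int64_t padding_mode);
//   DECLARE_DISPATCH(my_fn, my_stub);
//
//   // native/MyOp.cpp
//   DEFINE_DISPATCH(my_stub);
//   Tensor my_op(const Tensor& input, const Tensor& grid,
//                int64_t interpolation_mode, int64_t padding_mode) {
//     return my_stub(kCPU, input, grid, interpolation_mode, padding_mode);
//   }
//
//   // native/cpu/MyOpKernel.cpp (compiled once per CPU capability)
//   namespace {  // anonymous namespace so the different CPU builds don't clash
//   Tensor my_kernel(const Tensor& input, const Tensor& grid,
//                    int64_t interpolation_mode, int64_t padding_mode) {
//     return input;  // placeholder body
//   }
//   }  // namespace
//   REGISTER_DISPATCH(my_stub, &my_kernel);
// ------------------------------------------------------------------------------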
std::tuple grid_sampler_3d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, diff --git a/aten/src/ATen/native/GridSampler.h b/aten/src/ATen/native/GridSampler.h index f39b4e996469fa..ac9c72002c66cc 100644 --- a/aten/src/ATen/native/GridSampler.h +++ b/aten/src/ATen/native/GridSampler.h @@ -1,3 +1,5 @@ +#pragma once + #include "ATen/ATen.h" #include "ATen/NativeFunctions.h" diff --git a/aten/src/ATen/native/Indexing.cpp b/aten/src/ATen/native/Indexing.cpp index 288fa283abe660..5566fd397320aa 100644 --- a/aten/src/ATen/native/Indexing.cpp +++ b/aten/src/ATen/native/Indexing.cpp @@ -39,7 +39,7 @@ static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, ss << "The shape of the mask " << mask.sizes() << " at index " << maskIdx; ss << " does not match the shape of the indexed tensor " << self.sizes(); ss << " at index " << idx; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } static void checkIndexTensorTypes(TensorList indices) { @@ -47,9 +47,8 @@ static void checkIndexTensorTypes(TensorList indices) { if (tensor.defined()) { auto& type = tensor.type(); auto scalarType = type.scalarType(); - if (scalarType != kLong && scalarType != kByte) { - throw std::runtime_error("tensors used as indices must be long or byte tensors"); - } + AT_CHECK(scalarType == kLong || scalarType == kByte, + "tensors used as indices must be long or byte tensors"); } } } @@ -146,12 +145,10 @@ static Tensor wrapIndexOnce(const Tensor & index, int64_t dim, int64_t dim_size) if (index.numel() != 0) { auto max_idx = index.max().toCLong(); auto min_idx = index.min().toCLong(); - if (max_idx >= dim_size) { - AT_ERROR("index ", max_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); - } - if (min_idx < -dim_size) { - AT_ERROR("index ", min_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); - } + AT_CHECK(max_idx < dim_size, + "index ", max_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); + AT_CHECK(min_idx >= -dim_size, + "index ", min_idx, " is out of bounds for dimension ", dim, " with size ", dim_size); } return index.remainder(dim_size); } @@ -230,9 +227,8 @@ static std::tuple makeLinearIndex(Tensor self, TensorList orig) } Tensor index(const Tensor & self, TensorList indices) { - if (indices.size() > (size_t)self.dim()) { - AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); - } + AT_CHECK(indices.size() <= (size_t)self.dim(), + "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); Tensor src, linearIndex; std::tie(src, linearIndex) = makeLinearIndex(self, indices); @@ -240,9 +236,8 @@ Tensor index(const Tensor & self, TensorList indices) { } Tensor index_put(const Tensor & self, TensorList indices, const Tensor & value) { - if (indices.size() > (size_t)self.dim()) { - AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); - } + AT_CHECK(indices.size() <= (size_t)self.dim(), + "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); Tensor src, linearIndex, expandedValue; std::tie(src, linearIndex) = makeLinearIndex(self, indices); @@ -252,9 +247,8 @@ Tensor index_put(const Tensor & self, TensorList indices, const Tensor & value) } Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & value) { - if (indices.size() > (size_t)self.dim()) { - AT_ERROR("too many indices for tensor of dimension ", self.dim(), " (got ", 
indices.size(), ")"); - } + AT_CHECK(indices.size() <= (size_t)self.dim(), + "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); Tensor src, linearIndex, expandedValue; std::tie(src, linearIndex) = makeLinearIndex(self, indices); @@ -265,18 +259,14 @@ Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & value) { Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) { dim = maybe_wrap_dim(dim, self.dim()); - if (index.dim() >= 2) { - AT_ERROR( - "index_copy_(): Index should have dimension 1 or 0 (got ", index.dim(), ")"); - } + AT_CHECK(index.dim() < 2, + "index_copy_(): Index should have dimension 1 or 0 (got ", index.dim(), ")"); + int64_t numIndices = index.numel(); - if (source.dim() == 0 && numIndices != 1) { - AT_ERROR( - "index_copy_(): When source is scalar, index should have one element (got ", numIndices, ")"); - } - if (index.type().scalarType() != ScalarType::Long) { - AT_ERROR("index_copy_(): Expected LongTensor for index"); - } + AT_CHECK(source.dim() != 0 || numIndices == 1, + "index_copy_(): When source is scalar, index should have one element (got ", numIndices, ")"); + AT_CHECK(index.type().scalarType() == ScalarType::Long, + "index_copy_(): Expected LongTensor for index"); // Check that source and destination slices have the same size auto selfSlicedSizes = self.sizes().vec(); @@ -294,7 +284,7 @@ Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Ten ss << "index_copy_(): Source/destination tensor must have same slice shapes. "; ss << "Destination slice shape: " << selfSlicedSizes << " at dimension " << dim; ss << " and source slice shape: " << sourceSlicedSizes << " at dimension 0."; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } if (source.dim() > 0 && numIndices != source.size(dim)) { AT_ERROR( diff --git a/aten/src/ATen/native/LegacyBridge.cpp b/aten/src/ATen/native/LegacyBridge.cpp index 07d7e46ff79a56..0aaf2149b42a05 100644 --- a/aten/src/ATen/native/LegacyBridge.cpp +++ b/aten/src/ATen/native/LegacyBridge.cpp @@ -22,14 +22,6 @@ namespace { // TODO: Maybe the foo_ variants should call th_foo_ -Tensor norm(const Tensor & self, Scalar p) { - if (_has_native(self)) { - return native_norm(self, p); - } else { - return th_norm(self, p); - } -} - Tensor clone(const Tensor& self) { if (_has_native(self)) { return native_clone(self); @@ -144,34 +136,34 @@ Tensor& addmm_(Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta Tensor tensor(const Type& dtype) { if (_type_has_native(dtype)) { - return dtype.native_tensor(); + return at::getType(dtype.options()).native_tensor(); } else { - return dtype.th_tensor(); + return at::getType(dtype.options()).th_tensor(); } } Tensor tensor(const Type& dtype, ArrayRef size) { if (_type_has_native(dtype)) { - return dtype.native_tensor(size); + return at::getType(dtype.options()).native_tensor(size); } else { - return dtype.th_tensor(size); + return at::getType(dtype.options()).th_tensor(size); } } Tensor sparse_coo_tensor(const Type& dtype, ArrayRef size) { - return dtype.toSparse().native_sparse_coo_tensor(size); + return at::getType(dtype.options().layout(at::kSparse)).native_sparse_coo_tensor(size); } Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values) { - return values.type().toSparse().native_sparse_coo_tensor(indices, values); + return at::getType(values.options().layout(at::kSparse)).native_sparse_coo_tensor(indices, values); } Tensor sparse_coo_tensor(const Tensor& 
indices, const Tensor& values, ArrayRef size) { - return values.type().toSparse().native_sparse_coo_tensor(indices, values, size); + return at::getType(values.options().layout(at::kSparse)).native_sparse_coo_tensor(indices, values, size); } Tensor _sparse_coo_tensor_unsafe(const Tensor& indices, const Tensor& values, ArrayRef size) { - return values.type().toSparse()._native_sparse_coo_tensor_unsafe(indices, values, size); + return at::getType(values.options().layout(at::kSparse))._native_sparse_coo_tensor_unsafe(indices, values, size); } int64_t get_device(const Tensor& self) { diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 2371d82efc6cfb..50726cb99b81b9 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1,7 +1,10 @@ #include "ATen/ATen.h" #include "ATen/ExpandUtils.h" +#include "ATen/Dispatch.h" #include "ATen/NativeFunctions.h" #include "ATen/native/LinearAlgebraUtils.h" +#include "ATen/TensorUtils.h" +#include "ATen/Parallel.h" #include #include #include @@ -169,14 +172,14 @@ Tensor mm(const Tensor& self, const Tensor& mat2) { if (self.is_sparse()) { return mat2.type().addmm(at::zeros({}, mat2.type()), self, mat2, 0, 1); } - return self.type()._mm(self, mat2); + return at::_mm(self, mat2); } Tensor& mm_out(Tensor& result, const Tensor& self, const Tensor& mat2) { if (self.is_sparse()) { - return mat2.type().addmm_out(result, at::zeros({}, mat2.type()), self, mat2, 0, 1); + return at::addmm_out(result, at::zeros({}, mat2.options()), self, mat2, 0, 1); } - return self.type()._mm_out(result, self, mat2); + return at::_mm_out(result, self, mat2); } Tensor mv(const Tensor& self, const Tensor& vec) { @@ -222,6 +225,153 @@ Tensor& addr_out(Tensor &result, const Tensor& self, const Tensor& vec1, const T return at::_addr_out(result, self, vec1, vec2, beta, alpha); } +template +inline void baddbmm_cpu_kernel(const Tensor& result, const Tensor& self, const Tensor& mat2, Scalar beta_, Scalar alpha_) { + int64_t bs = result.size(0); + int64_t is = result.size(1); + int64_t js = result.size(2); + int64_t ks = self.size(2); + + scalar_t alpha = alpha_.to(); + scalar_t beta = beta_.to(); + + auto r0 = result.accessor(); + auto s0 = self.accessor(); + auto m0 = mat2.accessor(); + + int64_t grain_size = std::min(internal::GRAIN_SIZE / (is * js * ks), (int64_t)1); + parallel_for(0, bs, grain_size, [&](int64_t b_begin, int64_t b_end) { + for (int64_t b = b_begin; b < b_end; b++) { + auto r1 = r0[b]; + auto s1 = s0[b]; + auto m1 = m0[b]; + for (int64_t i = 0; i < is; i++) { + auto r2 = r1[i]; + auto s2 = s1[i]; + for (int64_t j = 0; j < js; j++) { + scalar_t &r = r2[j]; + if (is_bmm) { + r = 0; + for (int64_t k = 0; k < ks; k++) { + r += s2[k] * m1[k][j]; + } + } else { + r *= beta; + for (int64_t k = 0; k < ks; k++) { + r += alpha * s2[k] * m1[k][j]; + } + } + } + } + } + }); +} + +// This tries to apply some optimizations to bmm/baddbmm: +// - When the operand size is small, computation are parallelized over the batch +// dimension using OMP and naive matrix multiplication is applied. +// - When the operand size is larger than the threshold, if compiled with MKL, MKL's batch gemm is used. +// - Otherwise, we use a series of matrix multiplications. 
+// The threshold of 400 for the first has not been thoroughly benchmarked yet and may have room for further +// optimization, it likely depends on the characteristics of the CPU, MKL will be different from non-MKL etc., +// but this seems to be a first starting point. + +static inline Tensor& bmm_out_or_baddbmm_(Tensor& self_or_result, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha, bool is_bmm_out) { + // is_bmm_out: true for bmm_out, false for baddbmm_ + // self_or_result is "self" for baddbmm_ and "result" for bmm_out + CheckedFrom c = (is_bmm_out ? "bmm" : "baddbmm"); + TensorArg self_arg(self_or_result, is_bmm_out ? "self" : "result", 0); + TensorArg b1_arg(batch1, "batch1", 1); + TensorArg b2_arg(batch2, "batch2", 2); + checkDim(c, b1_arg, 3); + checkDim(c, b2_arg, 3); + + int64_t bs = batch1.size(0); + checkSize(c, b2_arg, 0, bs); + int64_t contraction_size = batch1.size(2); + int64_t res_rows = batch1.size(1); + int64_t res_cols = batch2.size(2); + checkSize(c, b2_arg, 1, contraction_size); + + if (is_bmm_out) { + self_or_result.resize_({bs, res_rows, res_cols}); + } else { + checkSize(c, self_arg, 0, bs); + checkSize(c, self_arg, 1, res_rows); + checkSize(c, self_arg, 2, res_cols); + } + + // handle pathological cases that blas may not like + if (self_or_result.numel() == 0) { + return self_or_result; + } else if (contraction_size == 0) { + return self_or_result.zero_(); + } + + auto batch_items_contiguous_or_transposed = [&](const Tensor& t) { + return (t.stride(2) == 1 && t.stride(1) == t.size(2)) + || (t.stride(1) == 1 && t.stride(2) == t.size(1)); + }; + + if (contraction_size * res_rows * res_cols < 400) { + if (is_bmm_out) { + AT_DISPATCH_ALL_TYPES(batch1.type(), "bmm", [&] { + baddbmm_cpu_kernel(self_or_result, batch1, batch2, beta, alpha); + }); + } else { + AT_DISPATCH_ALL_TYPES(batch1.type(), "baddbmm", [&] { + baddbmm_cpu_kernel(self_or_result, batch1, batch2, beta, alpha); + }); + } + } else if (at::hasMKL() && at::native::is_floating_point(self_or_result) + && batch_items_contiguous_or_transposed(batch1) + && batch_items_contiguous_or_transposed(batch2) + && self_or_result.is_contiguous()) { + at::native::_baddbmm_mkl_(self_or_result, batch1, batch2, beta, alpha); + } else { // split along batch dimension + if (is_bmm_out) { + for (int64_t b = 0; b < bs; b++) { + auto r = self_or_result.select(0, b); + at::native::mm_out(r, batch1.select(0, b), batch2.select(0, b)); + } + } else { + for (int64_t b = 0; b < bs; b++) { + self_or_result.select(0, b).addmm_(batch1.select(0, b), batch2.select(0, b), beta, alpha); + } + } + } + return self_or_result; +} + + +Tensor baddbmm_cpu(const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + Tensor result = self.type().tensor(); + return at::native::baddbmm_out_cpu(result, self, batch1, batch2, beta, alpha); +} + +Tensor& baddbmm_out_cpu(Tensor &result, const Tensor& self_, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + Tensor self; + std::tie(self) = expand_size(self_, {batch1.size(0), batch1.size(1), batch2.size(2)}, "baddbmm"); + result.resize_(self.sizes()); + result.copy_(self); + return at::native::baddbmm__cpu(result, batch1, batch2, beta, alpha); +} + +Tensor& baddbmm__cpu(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + return bmm_out_or_baddbmm_(self, batch1, batch2, beta, alpha, false); +} + +Tensor bmm_cpu(const Tensor& self, const Tensor& mat2) { + Tensor result = self.type().tensor(); + 
return at::native::bmm_out_cpu(result, self, mat2); +} + +Tensor& bmm_out_cpu(Tensor &result, const Tensor& batch1, const Tensor& batch2) { + Scalar beta(0.0); + Scalar alpha(1.0); + return bmm_out_or_baddbmm_(result, batch1, batch2, beta, alpha, true); +} + Tensor dot(const Tensor& self, const Tensor& tensor) { check_1d(self, "self", "dot"); check_1d(tensor, "tensor", "dot"); diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp index ccae5fb75f5b01..b7a9c52c64bd4e 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -287,7 +287,7 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_ for (int64_t c = 0; c < num_labels; c++) { scalar_t& res = grad_a[t][c]; scalar_t lp = log_probs_a[t][c]; - res = std::exp(lp)-std::exp(res + nll - lp) * gr; + res = (std::exp(lp)-std::exp(res + nll - lp)) * gr; } } // zero the remainder diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 24d8a41fb50271..ed0a94ae496718 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -11,11 +11,15 @@ namespace at { namespace native { namespace { void check_dims_match_num_input_features(const char* arg_name, int64_t expected, int64_t actual){ - if (actual != expected){ - std::stringstream ss; - ss << arg_name << " should contain " << expected << " elements not " << actual ; - throw std::runtime_error(ss.str()); + AT_CHECK(actual == expected, + arg_name, " should contain ", expected, " elements not ", actual); + } + + static inline Tensor repeat_if_defined(const Tensor& t, int64_t repeat) { + if (t.defined()) { + return t.repeat(repeat); } + return t; } } @@ -28,12 +32,12 @@ Tensor batch_norm( if (running_mean.defined()) { check_dims_match_num_input_features("running_mean", num_features, running_mean.numel()); } else if (!training) { - throw std::runtime_error("running_mean must be defined in evaluation mode"); + AT_ERROR("running_mean must be defined in evaluation mode"); } if (running_var.defined()) { check_dims_match_num_input_features("running_var", num_features, running_var.numel()); } else if (!training) { - throw std::runtime_error("running_var must be defined in evaluation mode"); + AT_ERROR("running_var must be defined in evaluation mode"); } if (weight.defined()) { check_dims_match_num_input_features("weight", num_features, weight.numel()); @@ -83,35 +87,57 @@ Tensor batch_norm( running_mean, running_var, training, momentum, eps); } +Tensor instance_norm( + const Tensor& input, const Tensor& weight /* optional */, const Tensor& bias /* optional */, + const Tensor& running_mean /* optional */, const Tensor& running_var /* optional */, + bool use_input_stats, double momentum, double eps, bool cudnn_enabled) { + AT_CHECK(use_input_stats || (running_mean.defined() && running_var.defined()), + "Expected running_mean and running_var to be defined when use_input_stats is false"); + std::vector shape = input.sizes().vec(); + int64_t b = input.size(0); + int64_t c = input.size(1); + shape[1] = b * c; + shape[0] = 1; + + Tensor weight_ = repeat_if_defined(weight, b); + Tensor bias_ = repeat_if_defined(bias, b); + Tensor running_mean_ = repeat_if_defined(running_mean, b); + Tensor running_var_ = repeat_if_defined(running_var, b); + + auto input_reshaped = input.contiguous().view(shape); + auto out = at::batch_norm(input_reshaped, weight_, bias_, running_mean_, running_var_, + use_input_stats, momentum, eps, cudnn_enabled); + + // we alias 
running_mean and running_var because they are const but we want to modify their data + if (running_mean.defined()) { + at::alias(running_mean).copy_(running_mean_.view({ b, c }).mean(0, false)); + } + if (running_var.defined()) { + at::alias(running_var).copy_(running_var_.view({ b, c }).mean(0, false)); + } + + return out.view(input.sizes()); +} + Tensor layer_norm(const Tensor& input, IntList normalized_shape, const Tensor& weight /* optional */, const Tensor& bias /* optional */, double eps, bool cudnn_enabled) { int64_t normalized_ndim = normalized_shape.size(); - if (normalized_ndim < 1) { - std::stringstream ss; - ss << "Expected normalized_shape to be at least 1-dimensional, i.e., " - << "containing at least one element, but got normalized_shape=" - << normalized_shape; - throw std::runtime_error(ss.str()); - } + AT_CHECK(normalized_ndim >= 1, + "Expected normalized_shape to be at least 1-dimensional, i.e., ", + "containing at least one element, but got normalized_shape=", + normalized_shape); - if (weight.defined() && !weight.sizes().equals(normalized_shape)) { - std::stringstream ss; - ss << "Expected weight to be of same shape as normalized_shape, but got " - << "weight of shape " << weight.sizes() << " and normalized_shape=" - << normalized_shape; - throw std::runtime_error(ss.str()); - } - - if (bias.defined() && !bias.sizes().equals(normalized_shape)) { - std::stringstream ss; - ss << "Expected bias to be of same shape as normalized_shape, but got " - << "bias of shape " << bias.sizes() << " and normalized_shape=" - << normalized_shape; - throw std::runtime_error(ss.str()); - } + AT_CHECK(!weight.defined() || weight.sizes().equals(normalized_shape), + "Expected weight to be of same shape as normalized_shape, but got ", + "weight of shape ", weight.sizes(), " and normalized_shape=", + normalized_shape); + AT_CHECK(!bias.defined() || bias.sizes().equals(normalized_shape), + "Expected bias to be of same shape as normalized_shape, but got ", + "bias of shape ", bias.sizes(), " and normalized_shape=", + normalized_shape); auto input_shape = input.sizes(); auto input_ndim = input.dim(); @@ -125,7 +151,7 @@ Tensor layer_norm(const Tensor& input, IntList normalized_shape, ss << ", " << size; } ss << "], but got input of size" << input_shape; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } int64_t n = 1; @@ -159,29 +185,19 @@ Tensor group_norm(const Tensor& input, int64_t num_groups, int64_t b = input.size(0); int64_t c = input.size(1); - if (c % num_groups != 0) { - std::stringstream ss; - ss << "Expected number of channels in input to be divisible by " - << "num_groups, but got input of shape " << input.sizes() << " and " - << "num_groups=" << num_groups; - throw std::runtime_error(ss.str()); - } - - if (weight.defined() && (weight.dim() != 1 || weight.numel() != c)) { - std::stringstream ss; - ss << "Expected weight to be a vector of size equal to the number of " - << "channels in input, but got weight of shape " << weight.sizes() - << " and input of shape " << input.sizes(); - throw std::runtime_error(ss.str()); - } - - if (bias.defined() && (bias.dim() != 1 || bias.numel() != c)) { - std::stringstream ss; - ss << "Expected bias to be a vector of size equal to the number of " - << "channels in input, but got bias of shape " << weight.sizes() - << " and input of shape " << input.sizes(); - throw std::runtime_error(ss.str()); - } + AT_CHECK(c % num_groups == 0, + "Expected number of channels in input to be divisible by ", + "num_groups, but got input of shape ", 
input.sizes(), " and " + "num_groups=", num_groups); + + AT_CHECK(!weight.defined() || (weight.dim() == 1 && weight.numel() == c), + "Expected weight to be a vector of size equal to the number of ", + "channels in input, but got weight of shape ", weight.sizes(), + " and input of shape ", input.sizes()); + AT_CHECK(!bias.defined() || (bias.dim() == 1 && bias.numel() == c), + "Expected bias to be a vector of size equal to the number of ", + "channels in input, but got bias of shape ", weight.sizes(), + " and input of shape ", input.sizes()); // Apply group norm auto input_reshaped = input.contiguous().view({1, b * num_groups, -1}); diff --git a/aten/src/ATen/native/PixelShuffle.cpp b/aten/src/ATen/native/PixelShuffle.cpp new file mode 100644 index 00000000000000..1f93ecbc8235ab --- /dev/null +++ b/aten/src/ATen/native/PixelShuffle.cpp @@ -0,0 +1,34 @@ +#include "ATen/native/TensorTransformations.h" + +#include +#include + +#include +#include + +namespace at { +namespace native { + +Tensor pixel_shuffle(const Tensor& self, int64_t upscale_factor) { + AT_ASSERTM(self.dim() == 4, + "pixel_shuffle expects 4D input, but got input with sizes ",self.sizes()); + int64_t b = self.size(0); + int64_t c = self.size(1); + int64_t h = self.size(2); + int64_t w = self.size(3); + int64_t upscale_factor_squared = upscale_factor * upscale_factor; + AT_ASSERTM(c % upscale_factor_squared == 0, + "pixel_shuffle expects input channel to be divisible by square of " + "upscale_factor, but got input with sizes ", self.sizes(), + ", upscale_factor=", upscale_factor, + ", and self.size(1)=", c, " is not divisible by ", upscale_factor_squared); + int64_t oc = c / upscale_factor_squared; + int64_t oh = h * upscale_factor; + int64_t ow = w * upscale_factor; + + auto input_reshaped = self.reshape({b, oc, upscale_factor, upscale_factor, h, w}); + return input_reshaped.permute({0 /* b */, 1 /* oc */, 4 /* h */, 2 /* 1st upscale_factor */, 5 /* w */, 3 /* 2nd upscale_factor */}) + .reshape({b, oc, oh, ow}); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index 9b06a513a14cd5..df937d6464b487 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -74,10 +74,19 @@ signature. - `std::array` (where N is `1-4`). NB: you MUST NOT put a space after the comma, otherwise this argument will not parse correctly. (If you decide to fix this, make sure you fix the argument parser both in ATen and in PyTorch.) +- `TensorOptions`. Tensor options provide information about how a + tensor should be constructed; it is most useful when you are writing a + factory function, where you have no `Tensor` inputs and thus + cannot otherwise determine how to construct a `Tensor`. - `*` is a special sentinel argument, which doesn't translate into an actual argument, but indicates that in the Python bindings, any subsequent arguments must be specified as keyword arguments (and cannot be provided positionally). +Functions with no tensor inputs are called *factory functions*, and +are handled specially by code generation. If your function is behaving +differently than another example, check first and see if one is a +factory while another is not. 
+ **Return types.** These types are permissible as ReturnType: - `Tensor` and `TensorList`, which translate into the C++ types `Tensor` and `std::vector`, @@ -218,8 +227,9 @@ direct consequences on valid implementations: * Never create a `Tensor` directly (e.g., `at::CPU` or `at::CUDA`), as a caller will be expecting to get `Variable`s out if it passes `Variable`. - Instead, create tensors from the `type()` of one of the input tensors, e.g., - `input.type().tensor()` or `input.type().toScalarType(kByte)` if you need + Instead, create tensors using the `options()` of one of the input + tensors. E.g., `at::empty(sizes, input.options())` or + `at::ones(input.options().dtype(kByte))`, if you need a different scalar type. * If you need to call other ATen functions, be sure to qualify the call diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index 2c7e641dcbe843..c976121e77ae3f 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -19,6 +19,7 @@ namespace native { DEFINE_DISPATCH(sum_kernel); DEFINE_DISPATCH(prod_kernel); +DEFINE_DISPATCH(norm_kernel); static inline Tensor integer_upcast(const Tensor& self, optional dtype) { ScalarType scalarType = self.type().scalarType(); @@ -584,12 +585,23 @@ Tensor& _sum_out(Tensor &result, const Tensor &self, IntList dims, bool keepdim) return reduce_multi_associative_out<_sum, _sum_out>(result, self, dims, keepdim); } -Tensor norm(const Tensor& self, Scalar p, int64_t dim, bool keepdim) { - Tensor result = self.type().tensor(); - return at::native::norm_out(result, self, p, dim, keepdim); +Tensor& _norm_out_cpu(Tensor& result, const Tensor& self, Scalar p, int64_t dim_, bool keepdim) { + int64_t dim = maybe_wrap_dim(dim_, self.dim()); + if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) + return result; + if (self.is_contiguous() && result.is_contiguous()) { + _dimreduce_setup(result, self, dim); + norm_kernel(kCPU, result, self, p, dim); + if (!keepdim) { + result.squeeze_(dim); + } + return result; + } else { + return at::_th_norm_out(result, self, p, dim, keepdim); + } } -Tensor &norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) { +Tensor& norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool keepdim) { AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, "norm only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes"); @@ -597,10 +609,44 @@ Tensor &norm_out(Tensor &result, const Tensor &self, Scalar p, int64_t dim, bool if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) { return result; } else { - return at::_th_norm_out(result, self, p, dim, keepdim); + if (self.is_cuda()) { + return at::_th_norm_out(result, self, p, dim, keepdim); + } else { + return _norm_out_cpu(result, self, p, dim, keepdim); + } + } +} + +Tensor _norm(const Tensor &self, Scalar p) { + if (self.type().is_sparse()) { + return at::native_norm(self, p); + } else { + AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, + "norm only supports CPU AND CUDA backend, got: ", at::toString(self.type().backend())); + AT_CHECK(at::isFloatingType(self.type().scalarType()), "norm only supports floating-point dtypes"); + if (self.is_cuda()) { + return at::th_norm(self, p); + } else { + if (self.is_contiguous()) { + Tensor result = 
CPU(kFloat).scalarTensor(0).toType(self.type()); + norm_kernel(kCPU, result, self, p, nullopt); + return result; + } else { + return at::th_norm(self, p); + } + } } } +Tensor norm(const Tensor& self, Scalar p, int64_t dim, bool keepdim) { + Tensor result = self.type().tensor(); + return at::native::norm_out(result, self, p, dim, keepdim); +} + +Tensor norm(const Tensor& self, Scalar p) { + return at::native::_norm(self, p); +} + Tensor all(const Tensor& self, int64_t dim, bool keepdim) { Tensor result = self.type().tensor(); return at::native::all_out(result, self, dim, keepdim); diff --git a/aten/src/ATen/native/RoiPooling.cpp b/aten/src/ATen/native/RoiPooling.cpp index 5995e43ef1e536..1a089a9f473c17 100644 --- a/aten/src/ATen/native/RoiPooling.cpp +++ b/aten/src/ATen/native/RoiPooling.cpp @@ -134,7 +134,7 @@ Tensor RoiPooling2d_backward_cpu( double spatialScale, const Tensor& gradOutput, const Tensor& argmaxes) { - throw std::runtime_error("not implemented"); + AT_ERROR("not implemented"); } } diff --git a/aten/src/ATen/native/Scalar.cpp b/aten/src/ATen/native/Scalar.cpp index 6ffb891bf7d777..975ae8c1ff9c47 100644 --- a/aten/src/ATen/native/Scalar.cpp +++ b/aten/src/ATen/native/Scalar.cpp @@ -18,7 +18,7 @@ Scalar _local_scalar(const Tensor& self) { Scalar _local_scalar_dense_cpu(const Tensor& self) { Scalar r; - AT_DISPATCH_ALL_TYPES_AND_HALF( + AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX( self.type(), "_local_scalar_dense_cpu", [&] { scalar_t value = *self.data(); r = Scalar(value); diff --git a/aten/src/ATen/native/SpectralOpsUtils.h b/aten/src/ATen/native/SpectralOpsUtils.h index 7518d1f945a5fd..875c7aa12b68cf 100644 --- a/aten/src/ATen/native/SpectralOpsUtils.h +++ b/aten/src/ATen/native/SpectralOpsUtils.h @@ -51,7 +51,7 @@ inline int64_t infer_ft_complex_to_real_onesided_size(int64_t complex_size, std::ostringstream ss; ss << "expected real signal size " << expected_size << " is incompatible " << "with onesided complex frequency size " << complex_size; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } } diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp new file mode 100644 index 00000000000000..9605736ee112fe --- /dev/null +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -0,0 +1,51 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" + +namespace at { +namespace native { + +static void ensure_has_index(Device* device) { + if (!device->is_cuda() || device->has_index()) { + return; + } + device->set_index(at::current_device()); +} + +static Tensor to_impl(const Tensor& self, const TensorOptions& options, bool non_blocking) { + return self.type().toBackend(options.backend()).toScalarType(options.dtype()) + .copy(self, non_blocking, options.device()); +} + +Tensor to(const Tensor& self, Device device, ScalarType dtype, bool non_blocking) { + ensure_has_index(&device); + if (self.device() == device && self.dtype() == dtype) { + return self; + } + return to_impl(self, self.options().device(device).dtype(dtype), non_blocking); +} + +Tensor to(const Tensor& self, ScalarType dtype, bool non_blocking) { + if (self.dtype() == dtype) { + return self; + } + return to_impl(self, self.options().dtype(dtype), non_blocking); +} + +Tensor to(const Tensor& self, Device device, bool non_blocking) { + ensure_has_index(&device); + if (self.device() == device) { + return self; + } + return to_impl(self, self.options().device(device), non_blocking); +} + +Tensor to(const Tensor& self, const Tensor& other, bool non_blocking) { + auto 
self_options = self.options(); + auto options = other.options(); + if (self_options == options) { + return self; + } + return to_impl(self, options, non_blocking); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 1a12549b5e70e9..178045d9fd0de4 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -59,6 +59,10 @@ void window_function_checks( window_length); } +const TypeExtendedInterface& getFactoryType(const TensorOptions& options) { + return at::getType(options); +} + } // namespace // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arange ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -73,7 +77,7 @@ Tensor arange( Scalar step, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._arange(start, end, step); + return getFactoryType(options)._arange(start, end, step); } Tensor& arange_out(Tensor& result, Scalar start, Scalar end) { @@ -86,7 +90,7 @@ Tensor& arange_out(Tensor& result, Scalar start, Scalar end, Scalar step) { Tensor arange(Scalar end, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._arange(end); + return getFactoryType(options)._arange(end); } Tensor& arange_out(Tensor& result, Scalar end) { @@ -94,7 +98,7 @@ Tensor& arange_out(Tensor& result, Scalar end) { } Tensor _dim_arange(const Tensor& like, int64_t dim) { - return like.type().toScalarType(at::kLong)._arange(like.size(dim)); + return at::getType(like.options().dtype(at::kLong))._arange(like.size(dim)); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ empty ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -102,7 +106,7 @@ Tensor _dim_arange(const Tensor& like, int64_t dim) { Tensor empty(IntList size, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] // Can't call a factory function, because the buck stops with us! 
- return at::getType(options).tensor(size); + return getFactoryType(options).tensor(size); } Tensor& empty_out(Tensor& result, IntList size) { @@ -218,7 +222,7 @@ Tensor linspace( int64_t steps, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._linspace(start, end, steps); + return getFactoryType(options)._linspace(start, end, steps); } Tensor& linspace_out(Tensor& result, Scalar start, Scalar end) { @@ -241,7 +245,7 @@ Tensor logspace( int64_t steps, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._logspace(start, end, steps); + return getFactoryType(options)._logspace(start, end, steps); } Tensor& logspace_out(Tensor& result, Scalar start, Scalar end) { @@ -475,7 +479,7 @@ Tensor range( Scalar step, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getType(options)._range(start, end, step); + return getFactoryType(options)._range(start, end, step); } Tensor& range_out(Tensor& result, Scalar start, Scalar end) { diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/native/TensorIterator.cpp index bae2a94b86273b..15d86fb5162a3e 100644 --- a/aten/src/ATen/native/TensorIterator.cpp +++ b/aten/src/ATen/native/TensorIterator.cpp @@ -98,7 +98,8 @@ void TensorIterator::compute_common_type() { if (op.tensor->defined() && type != op.tensor->type()) { if (op.tensor->dim() == 0) { if (type.backend() != at::Backend::CUDA) { - *op.tensor = op.tensor->toType(type); + cast_tensors_.emplace_back(op.tensor->toType(type)); + op.tensor = &(cast_tensors_.back()); } } else { op.needs_cast = true; diff --git a/aten/src/ATen/native/TensorIterator.h b/aten/src/ATen/native/TensorIterator.h index 245866373d4763..3faedbec6bb320 100644 --- a/aten/src/ATen/native/TensorIterator.h +++ b/aten/src/ATen/native/TensorIterator.h @@ -184,6 +184,7 @@ struct AT_API TensorIterator { DimVector shape_; DimVector perm_; SmallVector operands_; + SmallVector cast_tensors_; int num_outputs_ = 0; bool has_coalesced_dimensions_ = false; }; diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index cf6a9ece5c0d9d..634e7a443d21fd 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -20,9 +20,8 @@ std::vector broadcast_tensors(TensorList tensors) { static void check_cat_no_zero_dim(TensorList tensors) { for(size_t i = 0; i < tensors.size(); ++i) { auto& t = tensors[i]; - if (t.dim() == 0) { - AT_ERROR("zero-dimensional tensor (at position ", i, ") cannot be concatenated"); - } + AT_CHECK(t.dim() > 0, + "zero-dimensional tensor (at position ", i, ") cannot be concatenated"); } } @@ -39,12 +38,11 @@ Tensor cat(TensorList tensors, int64_t dim) { } std::vector chunk(const Tensor& self, int64_t chunks, int64_t dim) { - if (self.dim() == 0) { - AT_ERROR("chunk expects at least a 1-dimensional tensor"); - } - if (chunks <= 0) { - AT_ERROR("chunk expects `chunks` to be greater than 0, got: ", chunks); - } + AT_CHECK(self.dim() > 0, + "chunk expects at least a 1-dimensional tensor"); + AT_CHECK(chunks > 0, + "chunk expects `chunks` to be greater than 0, got: ", chunks); + int64_t split_size = (self.size(dim) + chunks - 1) / chunks; // We need to call split_with_sizes in the case where split_size and dimension size are 0, because @@ -117,14 +115,11 @@ Tensor expand(const Tensor& self, IntList size, bool implicit) { // distinguish between expands inserted by 
broadcasts and those explicitly // requested by the user, because it is legal to remove implicit expands // from the graph, but not legal to remove the explicit ones. - if (size.size() < (size_t)self.dim()) { - std::ostringstream ss; - ss << "expand(" << self.type() << "{" << self.sizes() << "}, size=" << size - << "): the number of sizes provided (" << size.size() << ") " - << "must be greater or equal to the number of dimensions in the tensor (" - << self.dim() << ")"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(size.size() >= (size_t)self.dim(), + "expand(", self.type(), "{", self.sizes(), "}, size=", size, + "): the number of sizes provided (", size.size(), ") ", + "must be greater or equal to the number of dimensions in the tensor (", + self.dim(), ")"); std::vector expandedSizes; std::vector expandedStrides; @@ -159,17 +154,15 @@ Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { if (start != cur_size) { // start being the end is valid, but not a valid dim specification. start = maybe_wrap_dim(start, cur_size); } - if (length < 0 || start > cur_size - length) { - AT_ERROR("start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ")."); - } + AT_CHECK(length >= 0 && start <= cur_size - length, + "start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ")."); return at::slice(self, dim, start, start + length, 1); } Tensor permute(const Tensor& self, IntList dims) { auto nDims = self.dim(); - if (dims.size() != (size_t)nDims) { - AT_ERROR("number of dims don't match in permute"); - } + AT_CHECK(dims.size() == (size_t)nDims, + "number of dims don't match in permute"); auto oldSizes = self.sizes(); auto oldStrides = self.strides(); std::vector newSizes(nDims); @@ -177,9 +170,8 @@ Tensor permute(const Tensor& self, IntList dims) { std::vector seen(nDims); for (int64_t i = 0; i < nDims; i++) { auto dim = maybe_wrap_dim(dims[i], nDims); - if (seen[dim]) { - AT_ERROR("repeated dim in permute"); - } + AT_CHECK(!seen[dim], + "repeated dim in permute"); seen[dim] = true; newSizes[i] = oldSizes[dim]; newStrides[i] = oldStrides[dim]; @@ -188,9 +180,8 @@ Tensor permute(const Tensor& self, IntList dims) { } Tensor repeat(const Tensor& self, IntList repeats) { - if (repeats.size() < (size_t)self.dim()) { - AT_ERROR("Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor"); - } + AT_CHECK(repeats.size() >= (size_t)self.dim(), + "Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor"); // Add new leading dimensions to the tensor if the // number of target dimensions is larger than the @@ -206,7 +197,7 @@ Tensor repeat(const Tensor& self, IntList repeats) { Tensor xtensor = self.expand(padded_size); Tensor result = self.type().tensor(target_size); - Tensor urtensor = result.type().alias(result); + Tensor urtensor = at::alias(result); for (int64_t i = 0; i < xtensor.dim(); ++i) { // can't unfold with step 0, so make sure step is at least 1 // (it doesn't matter what it is in that case, because the size is 0). 
@@ -238,12 +229,9 @@ Tensor select(const Tensor& self, int64_t dim, int64_t index) { AT_CHECK(ndim > 0, "select() cannot be applied to a 0-dim tensor."); dim = maybe_wrap_dim(dim, ndim); auto size = self.size(dim); - if (index < -size || index >= size) { - std::stringstream ss; - ss << "select(): index " << index << " out of range for tensor of size "; - ss << self.sizes() << " at dimension " << dim; - throw std::runtime_error(ss.str()); - } + AT_CHECK(index >= -size && index < size, + "select(): index ", index, " out of range for tensor of size ", + self.sizes(), " at dimension ", dim); if (index < 0) { index += size; } @@ -261,10 +249,8 @@ Tensor slice(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_ dim = maybe_wrap_dim(dim, ndim); auto sizes = self.sizes().vec(); auto strides = self.strides().vec(); - if (step <= 0) { - // TODO: support negative strides - throw std::runtime_error("slice step must be positive"); - } + // TODO: support negative strides + AT_CHECK(step > 0, "slice step must be positive"); if (start < 0) { start += sizes[dim]; } @@ -322,22 +308,15 @@ std::vector split_with_sizes(const Tensor& self, IntList split_sizes, in for (i = 0; i < num_splits; ++i) { auto length = split_sizes[i]; - if (length < 0) { - std::ostringstream ss; - ss << "split_with_sizes expects split_sizes have only non-negative " - << "entries, but got split_sizes=" << split_sizes; - throw std::runtime_error(ss.str()); - } + AT_CHECK(length >= 0, + "split_with_sizes expects split_sizes have only non-negative ", + "entries, but got split_sizes=", split_sizes); splits[i] = self.narrow(dim, start_idx, length); start_idx += length; } - if (start_idx != dim_size) { - std::ostringstream ss; - ss << "split_with_sizes expects split_sizes to sum exactly to " - << dim_size << " (input tensor's size at dimension " << dim << "), " - << "but got split_sizes=" << split_sizes; - throw std::runtime_error(ss.str()); - } + AT_CHECK(start_idx == dim_size, + "split_with_sizes expects split_sizes to sum exactly to ", dim_size, + " (input tensor's size at dimension ", dim, "), ", "but got split_sizes=", split_sizes); return splits; } @@ -350,28 +329,24 @@ static inline std::vector get_stack_inputs(TensorList tensors, int64_t d } Tensor stack(TensorList tensors, int64_t dim) { - if (tensors.size() == 0) { - throw std::runtime_error("stack expects a non-empty TensorList"); - } + AT_CHECK(tensors.size() > 0, + "stack expects a non-empty TensorList"); dim = maybe_wrap_dim(dim, tensors[0].dim() + 1); return at::cat(get_stack_inputs(tensors, dim), dim); } Tensor& stack_out(Tensor& result, TensorList tensors, int64_t dim) { - if (tensors.size() == 0) { - throw std::runtime_error("stack expects a non-empty TensorList"); - } + AT_CHECK(tensors.size() > 0, + "stack expects a non-empty TensorList"); dim = maybe_wrap_dim(dim, tensors[0].dim() + 1); return at::cat_out(result, get_stack_inputs(tensors, dim), dim); } static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t dim1) { int64_t nsparseDims = self._sparseDims(); - if (dim0 >= nsparseDims || dim1 >= nsparseDims) { - AT_ERROR( - "sparse transpose: transposed dimensions must be sparse ", - "Got sparseDims: ", nsparseDims, ", d0: ", dim0, ", d1: ", dim1); - } + AT_CHECK(dim0 < nsparseDims && dim1 < nsparseDims, + "sparse transpose: transposed dimensions must be sparse ", + "Got sparseDims: ", nsparseDims, ", d0: ", dim0, ", d1: ", dim1); if (self._indices().numel() == 0 && self._values().numel() == 0) { auto sizes = self.sizes().vec(); @@ 
-442,10 +417,9 @@ static void check_t(const Tensor& self, const char *fn) { if (self.is_sparse()) { int64_t sparseDims = self._sparseDims(); int64_t denseDims = self._denseDims(); - if (!(sparseDims == 2 && denseDims == 0)) { - AT_ERROR(fn, " expects a tensor with 2 sparse and 0 dense dimensions, but got ", - sparseDims, " sparse and ", denseDims, " dense dimensions"); - } + AT_CHECK(sparseDims == 2 && denseDims == 0, + fn, " expects a tensor with 2 sparse and 0 dense dimensions, but got ", + sparseDims, " sparse and ", denseDims, " dense dimensions"); } else if (self.dim() != 2) { AT_ERROR(fn, " expects a 2D tensor, but self is ", self.dim(), "D"); } @@ -622,6 +596,10 @@ std::vector meshgrid(TensorList tensors) { AT_ERROR("Expected scalar or 1D tensor in the tensor list but got: ", tensors[i]); } } + for(int64_t i = 0; i < size - 1; i++){ + AT_CHECK(tensors[i].dtype() == tensors[i+1].dtype(), "meshgrid expects all tensors to have the same dtype"); + AT_CHECK(tensors[i].device() == tensors[i+1].device(), "meshgrid expects all tensors to have the same device"); + } std::vector grids; for(int64_t i = 0; i < size; i++) { std::vector view_shape(size, 1); diff --git a/aten/src/ATen/native/WeightNorm.cpp b/aten/src/ATen/native/WeightNorm.cpp new file mode 100644 index 00000000000000..1627b4c2596e12 --- /dev/null +++ b/aten/src/ATen/native/WeightNorm.cpp @@ -0,0 +1,117 @@ +#include "ATen/ATen.h" +#include "ATen/TensorUtils.h" +#include "ATen/NativeFunctions.h" + +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +namespace at { +namespace native { + +// Staying faithful to the Python for now for clarity, look for optimizations later +// (e.g., single return statement for RVO) +Tensor norm_except_dim(const Tensor & v, int64_t pow, int64_t dim) +{ + // I assume tensor.contiguous(), view(), norm(), etc. here will dispatch through VariableType. + if (dim == -1) { + return v.norm(pow); + } else if (dim == 0) { + std::vector output_size(v.dim(), 1); + output_size[0] = v.size(0); + return v.contiguous().view({v.size(0), -1}).norm(pow, 1).view(output_size); + } else if (dim == v.dim() - 1) { + std::vector output_size(v.dim(), 1); + output_size[v.dim() - 1] = v.size(v.dim() - 1); + return v.contiguous().view({-1, v.size(v.dim() - 1)}).norm(pow, 0).view(output_size); + } else { + // To consider: at::native::norm_except_dim is probably fine as well, + // and would avoid an additional dynamic dispatch. + return at::norm_except_dim(v.transpose(0, dim), pow, 0).transpose(0, dim); // optimize? + } +} + +Tensor _weight_norm + (const Tensor & v_in, + const Tensor & g_in, + int64_t dim) +{ + + AT_CHECK( + v_in.device() == g_in.device(), + "weight_norm: expected v_in and g_in to be on the same device, but v_in is " + "on ", v_in.device(), " and g_in is on ", g_in.device()); + + auto v = v_in.contiguous(); + auto g = g_in.contiguous(); + + bool can_use_fused = v.type().is_cuda() && (dim == 0 || dim == v.dim() - 1); + + if (can_use_fused) { + // weight_norm does not have a derivative defined for it, so this will route back through + // VariableType.cpp, and construct a WeightNormFusedBackward object in the autograd graph. + return std::get<0>(at::_weight_norm_cuda_interface(v, g, dim)); + } else { + // Double-differentiable primitive ops + // at::native::norm_except_dim would probably be fine as well. + return v*(g/at::norm_except_dim(v, 2, dim)); + } +} + +// Differentiable backward path, an alternative to weight_norm_cuda_backward, to be used +// when backward is itself creating a graph. 
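// For reference, a sketch of the gradients this path computes (derived from the
// definition used above, w = g * v / ||v|| taken over each slice along `dim`):
// with s = sum_over_slice(grad_w * v),
//   grad_g = s / ||v||
//   grad_v = (g / ||v||) * (grad_w - v * s / ||v||^2),
// which is what the per_dim_sums / norms expressions below evaluate.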
+// The GradMode::is_enabled() check must be performed within Functions.cpp; that's why we +// define a separate function here, instead of inlining it in weight_norm_cuda_backward. +std::tuple _weight_norm_differentiable_backward + (const Tensor & grad_w, + const Tensor & saved_v, + const Tensor & saved_g, + const Tensor & saved_norms, + int64_t dim) +{ + // In Functions.cpp, the HardshrinkBackward object supplies "grad.contiguous()" + // as the first argument, so grad_w should be contiguous here. + // All these checks should succeed: + AT_CHECK(grad_w.is_contiguous(), "grad_w must be contiguous"); + AT_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); + AT_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); + AT_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous"); + + int64_t last_dim = saved_v.dim() - 1; + int64_t last_size = saved_v.size(last_dim); + + // Like weight_norm_fused_backward, weight_norm_differentiable_backward should only ever be called + // through a WeightNormFusedBackward object, so we expect that dim == 0 || dim == saved_v.size(-1) + AT_CHECK(dim == 0 || dim == last_dim, "Expected dim to be the first or last dimension"); + + // saved_g and saved_norms are already shaped to broadcast over the correct dimensions + + // ...but saved_norms might be Float when saved_g and saved_v are half. + // To consider: saved_norms.to(..., True /*non_blocking*/); + auto norms = saved_norms.to(saved_g.type().scalarType()); + + std::vector bcast_size(saved_v.dim(), 1); + + // Analytic backward path using differentiable primitive ops + if (dim == 0) { + bcast_size[0] = saved_v.size(0); + auto per_dim_sums = (grad_w*saved_v).view({saved_v.size(0), -1}).sum(1).view(bcast_size); + auto grad_v = (saved_g/norms)*(grad_w - saved_v*(per_dim_sums/(norms*norms))); + auto grad_g = per_dim_sums/norms; + return std::tuple{grad_v, grad_g}; + } else { // dim == last_dim + bcast_size[last_dim] = last_size; + auto per_dim_sums = (grad_w*saved_v).view({-1, last_size}).sum(0).view(bcast_size); + auto grad_v = (saved_g/norms)*(grad_w - saved_v*(per_dim_sums/(norms*norms))); + auto grad_g = per_dim_sums/norms; + return std::tuple{grad_v, grad_g}; + } +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp new file mode 100644 index 00000000000000..648defd192e117 --- /dev/null +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -0,0 +1,890 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +#include +#include +#include + +namespace at { namespace native { namespace { + +/** NOTE [ Grid Sample CPU Kernels ] + * + * Implementation of vectorized grid sample CPU kernels is divided into three + * parts. More detailed description exist after this paragraph, but on a high + * level, they are + * 1. `ComputeLocation` struct + * + Computes the interpolation location basing on padding mode. + * 2. `ApplyGridSample` struct + * + Owns N (# spatial dims) `ComputeLocation` structs, and uses them to + * compute the interpolation locations. + * + Interpolates the values and writes to output. + * 3. `grid_sample_2d_grid_slice_iterator` function + * + Iterates over a slice of the grid tensor based on the geometry by the + * spatial ordering, i.e., the first iteration will process grid values + * grid[n, 0, 0, :], grid[n, 0, 1, :], grid[n, 0, 2, :], ... 
+ * (Recall that, e.g., 2D grid has shape [N x H x W x 2], so grid[n, ...] + * is a slice, and grid[n, h, w, :] contains the values for a single + * output spatial location.) + * + Applies a given operator at each iteration, so we can use the same + * pattern for forward and backward. + * + * Putting everything together, we have, e.g., the forward kernel implemented + * as + * + * // `ApplyGridSample` struct that processes grid values, extracts and + * // interpolates input values, and write to output. + * ApplyGridSample grid_sample(input_accessor); + * + * // For each slice, we call `grid_sample_2d_grid_slice_iterator` with + * // 1. the grid slice, and + * // 2. a lambda that takes in + * // i. location vectors (x and y for 2D) extracted from grid + * // ii. `spatial_offset` as the spatial offset of these vectors + * // from the beginning of this slice. + * // iii. `len` as the number of valid locations in the vectors. + * // (There might not be enough near boundary.) + * for (int n = 0; n < input_accessor.size(0); n++) { + * grid_sample_2d_grid_slice_iterator( + * grid_accessor[n], + * [&](const Vec256& grid_x, + * const Vec256& grid_y, + * int64_t spatial_offset, int64_t len) { + * grid_sample.forward(out_accessor[n], input_accessor[n], + * spatial_offset, grid_x, grid_y, len); + * }); + * } + * + * Now we talk about details of each of these three parts: + * + * 1. `ComputeLocation` struct + * Transforms grid values into interpolation locations of the input tensor + * for a particular spatial dimension, based on the size of that dimension + * in input tensor, and the padding mode. + * + * template + * struct ComputeLocation { + * using Vec = Vec256; + * + * // ctor + * ComputeLocation(int64_t size); + * + * // Given grid values `in`, return the interpolation locations after + * // un-normalization and padding mechanism (elementwise). + * Vec apply(const Vec &in) const; + * + * // Similar to `apply`, but also returns `d apply(in) / d in` + * // (elementwise). + * // this is often used in gradient computation. + * std::pair apply_get_grad(const Vec &in) const; + * }; + * + * 2. `ApplyGridSample` struct + * Owns N `ComputeLocation` structs, where N is the number of spatial + * dimensions. Given N input grid vectors (one for each spatial dimension) + * and spatial offset, it gets the interpolation locations from + * `ComputeLocation`s, applies interpolation procedure, and then writes to + * the output (or grad_input & grad_grid in backward). + * + * template + * struct ApplyGridSample { + * + * // ctor + * ApplyGridSample(const TensorAccessor& input); + * + * // Applies grid sampling (forward) procedure: + * // 1. computes interpolation locations from grid values `grid_x` + * // and `grid_y`, + * // 2. interpolates output values using the locations and input + * // data in `inp_slice`, and + * // 3. writes the first `len` values in the interpolated vector to + * // `out_slice` with spatial offset being `offset`. + * // + * // This assimes that `grid_x` and `grid_y` all contain valid grid + * // values \in [-1, 1], even at indices greater than `len`. + * // + * // The `*_slice` argument namess mean samples within a batch (i.e., + * // with the batch dimension sliced out). + * void forward(TensorAccessor& out_slice, + * const TensorAccessor& inp_slice, + * int64_t offset, const Vec& grid_x, const Vec& grid_y, + * int64_t len) const; + * + * // Applies grid sampling (backward) procedure. Arguments semantics + * // and strategy are similar to those of `forward`. 
+ * void backward(TensorAccessor& gInp_slice, + * TensorAccessor& gGrid_slice, + * const TensorAccessor& gOut_slice, + * const TensorAccessor& inp_slice, + * int64_t offset, const Vec& grid_x, const Vec& grid_y, + * int64_t len) const; + * }; + * + * 3. `grid_sample_2d_grid_slice_iterator` function + * Among the tensors we work with, we know that the output tensors are + * contiguous (i.e., `output` in forward, and `grad_input` & `grad_grid` in + * backward), we need to randomly read `input` anyways, and `grad_output` + * usually comes from autograd and is often contiguous. So we base our + * iterating strategy on the geometry of grid. + * `grid_sample_2d_grid_slice_iterator` function provides an abstraction to + * efficiently iterate through a `grid` slice (without batch dimension). + * See comments of that function on the specific cases and strategies used. + * + * template + * void grid_sample_2d_grid_slice_iterator( + * const TensorAccessor& grid_slice, + * const ApplyFn &apply_fn); + * + * `apply_fn` is a function/lambda that takes in + * i. location vectors (x and y for 2D) extracted from grid + * ii. `spatial_offset` as the spatial offset of these vectors + * from the beginning of this slice. + * iii. `len` as the number of valid locations in the vectors. + * (There might not be enough near boundary.) + + * It should be callable as if it has declaration: + * void apply_fn(const Vec256& grid_x, + * const Vec256& grid_y, + * int64_t spatial_offset, int64_t len); + * + * `apply_fn` will be called multiple times, and together cover the entire + * output spatial space. + * + * Now you should be able to understand everything about the implementation of + * 2D forward kernel shown at the beginning of this note. + * + **/ + + +using at::native::detail::GridSamplerInterpolation; +using at::native::detail::GridSamplerPadding; +using namespace at::vec256; + + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ComputeLocation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Struct to compute interpolation location from grid values, and to apply +// padding mechanism (e.g., reflection). +// See NOTE [ Grid Sample CPU Kernels ] for details.
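// A concrete reading of the unnormalization below (illustrative numbers only):
// ComputeLocationBase::unnormalize() maps a grid value x in [-1, 1] to an input
// coordinate (x + 1) * (size - 1) / 2, so with size = 5: x = -1 -> 0, x = 0 -> 2,
// x = 1 -> 4. The padding specializations then clamp (Border) or fold back
// (Reflection) coordinates falling outside [0, size - 1], while Zeros leaves
// them as-is and relies on the in-bound masks computed in ApplyGridSample.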
+ +template +struct ComputeLocationBase { + using Vec = Vec256; + + const scalar_t half_max_val; + + ComputeLocationBase(int64_t size) + : half_max_val(static_cast(size - 1) / 2) {} + + inline Vec unnormalize(const Vec &in) const { + return (in + Vec(1)) * Vec(half_max_val); + } +}; + +template +struct ComputeLocation; + +template +struct ComputeLocation + : ComputeLocationBase { + using Vec = Vec256; + using ComputeLocationBase::unnormalize; + using ComputeLocationBase::half_max_val; + + using ComputeLocationBase::ComputeLocationBase; + + inline Vec apply(const Vec &in) const { + return unnormalize(in); + } + + inline std::pair apply_get_grad(const Vec &in) const { + return std::make_pair(unnormalize(in), Vec(half_max_val)); + } +}; + +template +struct ComputeLocation + : ComputeLocationBase { + using Vec = Vec256; + using ComputeLocationBase::unnormalize; + using ComputeLocationBase::half_max_val; + + const scalar_t max_val; + + ComputeLocation(int64_t size) + : ComputeLocationBase(size) + , max_val(static_cast(size - 1)) {} + + inline Vec apply(const Vec &in) const { + return min(Vec(max_val), max(unnormalize(in), Vec(0))); + } + inline std::pair apply_get_grad(const Vec &in) const { + using int_t = int_same_size_t; + Vec max_val_vec(max_val), zeros(0); + auto indices = unnormalize(in); + auto bounded_lo = max(indices, zeros); + // Integral type equality comparison is very very fast because it just looks + // at the bits. Casting is free too. So we use the following pattern instead + // of comparison + blendv. + auto in_bound_lo = cast(cast(bounded_lo) == cast(indices)); + auto res = min(bounded_lo, max_val_vec); + auto in_bound_hi = cast(cast(res) == cast(indices)); + return std::make_pair(res, (in_bound_lo & in_bound_hi) & Vec(half_max_val)); + } +}; + +template +struct ComputeLocation + : ComputeLocationBase { + using Vec = Vec256; + using ComputeLocationBase::unnormalize; + using ComputeLocationBase::half_max_val; + + bool unit_size; // whether size == 1, just return 0 in this case + const scalar_t double_max_val; + const scalar_t neg_half_max_val; + + ComputeLocation(int64_t size) + : ComputeLocationBase(size) + , unit_size(size == 1) + , double_max_val(static_cast((size - 1) * 2)) + , neg_half_max_val(-0.5 * static_cast(size - 1)) {} + + inline Vec apply(const Vec &in) const { + if (unit_size) { + return Vec(0); + } + Vec double_max_val_vec(double_max_val); + auto abs_in = unnormalize(in).abs(); + auto fdouble_flips = abs_in / double_max_val_vec; + auto double_flips = fdouble_flips.trunc(); + auto extra = abs_in - double_flips * double_max_val_vec; + // Now we need to test if extra > max_val to find out if another flip is + // needed. The following comparison does that and returns the correct + // flipped value. 
+ return min(extra, double_max_val_vec - extra); + } + + inline std::pair apply_get_grad(const Vec &in) const { + if (unit_size) { + return std::make_pair(Vec(0), Vec(0)); + } + Vec double_max_val_vec(double_max_val); + auto unnorm_in = unnormalize(in); + auto neg_in = unnorm_in < Vec(0); + auto abs_in = unnorm_in.abs(); + auto fdouble_flips = abs_in / double_max_val_vec; + auto double_flips = fdouble_flips.trunc(); + + auto extra = abs_in - double_flips * double_max_val_vec; + auto reflected_extra = double_max_val_vec - extra; + auto one_more_flip = extra > reflected_extra; + + return std::make_pair( + Vec::blendv(extra, reflected_extra, one_more_flip), + Vec::blendv(Vec(half_max_val), Vec(neg_half_max_val), one_more_flip ^ neg_in) + ); + } +}; + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ApplyGridSample ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Struct to apply grid sample (reading from input, interpolate, and write to +// output). +// See NOTE [ Grid Sample CPU Kernels ] for details. + +template +static inline void +mask_scatter_add(const scalar_t *src, scalar_t* base_addr, + const int_same_size_t *offsets, + const int_same_size_t *mask, int64_t len) { + #pragma unroll + for (int64_t i = 0; i < len; i++) { + if (mask[i] & 0x01) { + base_addr[offsets[i]] += src[i]; + } + } +} + +template +struct ApplyGridSample; + +template +struct ApplyGridSample { + using Vec = Vec256; + using integer_t = int_same_size_t; + using iVec = Vec256; + + const int64_t inp_H; + const int64_t inp_W; + const int64_t inp_sH; + const int64_t inp_sW; + const int64_t C; + const int64_t inp_sC; + const ComputeLocation compute_H; + const ComputeLocation compute_W; + const bool must_in_bound = padding != GridSamplerPadding::Zeros; + + ApplyGridSample(const TensorAccessor& input) + : inp_H(input.size(2)) + , inp_W(input.size(3)) + , inp_sH(input.stride(2)) + , inp_sW(input.stride(3)) + , C(input.size(1)) + , inp_sC(input.stride(1)) + , compute_H(input.size(2)) + , compute_W(input.size(3)) {} + + inline std::tuple< + Vec, Vec, Vec, Vec, // distances to 4 sides + Vec, Vec, Vec, Vec, // interpolation weights wrt 4 corners + Vec, Vec, Vec, Vec, // in_bound masks + iVec, iVec // y_n and x_w + > + compute_interp_params(const Vec& x, const Vec& y) const { + // get NE, NW, SE, SW pixel values from (x, y) + // assuming we get exact integer representation and just use scalar_t + // if we don't, the weights will be garbage anyways. + auto x_w = x.floor(); + auto y_n = y.floor(); + + // get distances to each side + auto w = x - x_w; + auto e = Vec(1) - w; + auto n = y - y_n; + auto s = Vec(1) - n; + + // get interpolation weights for each neighbor + // e.g., for the nw corder, the weight is `dist_to_south * dist_to_east`. + auto nw = s * e; + auto ne = s * w; + auto sw = n * e; + auto se = n * w; + + auto i_x_w = convert_to_int_of_same_size(x_w); + auto i_y_n = convert_to_int_of_same_size(y_n); + auto i_x_e = i_x_w + iVec(1); + auto i_y_s = i_y_n + iVec(1); + + // Use int comparison because it is much faster than float comp with AVX2 + // (latency 1 cyc vs. 4 cyc on skylake) + // Avoid using the le and ge because those are not implemented in AVX2 and + // are actually simulated using multiple instructions. + auto w_mask = must_in_bound ? iVec(-1) // true = all ones + : (i_x_w > iVec(-1)) & (i_x_w < iVec(inp_W)); + auto n_mask = must_in_bound ? iVec(-1) // true = all ones + : (i_y_n > iVec(-1)) & (i_y_n < iVec(inp_H)); + auto e_mask = must_in_bound ? 
(i_x_e < iVec(inp_W)) + : (i_x_e > iVec(-1)) & (i_x_e < iVec(inp_W)); + auto s_mask = must_in_bound ? (i_y_s < iVec(inp_H)) + : (i_y_s > iVec(-1)) & (i_y_s < iVec(inp_H)); + auto nw_mask = cast(must_in_bound ? iVec(-1) : (w_mask & n_mask)); + auto ne_mask = cast(e_mask & n_mask); + auto sw_mask = cast(w_mask & s_mask); + auto se_mask = cast(e_mask & s_mask); + + return std::make_tuple( + n, s, w, e, + nw, ne, sw, se, + nw_mask, ne_mask, sw_mask, se_mask, + i_y_n, i_x_w); + } + + inline void forward(TensorAccessor& out_slice, + const TensorAccessor& inp_slice, + int64_t offset, const Vec& grid_x, const Vec& grid_y, + int64_t len) const { + auto x = compute_W.apply(grid_x); + auto y = compute_H.apply(grid_y); + + auto interp_params = compute_interp_params(x, y); + + auto nw = std::get<4>(interp_params); + auto ne = std::get<5>(interp_params); + auto sw = std::get<6>(interp_params); + auto se = std::get<7>(interp_params); + + auto nw_mask = std::get<8>(interp_params); + auto ne_mask = std::get<9>(interp_params); + auto sw_mask = std::get<10>(interp_params); + auto se_mask = std::get<11>(interp_params); + + auto i_y_n = std::get<12>(interp_params); + auto i_x_w = std::get<13>(interp_params); + + auto i_nw_offset = i_y_n * iVec(inp_sH) + i_x_w * iVec(inp_sW); + auto i_ne_offset = i_nw_offset + iVec(inp_sW); + auto i_sw_offset = i_nw_offset + iVec(inp_sH); + auto i_se_offset = i_sw_offset + iVec(inp_sW); + + #pragma unroll + for (int64_t c = 0; c < C; ++c) { + auto inp_slice_C_ptr = inp_slice[c].data(); + + // mask_gather zeros out the mask, so we need to make copies + Vec nw_mask_copy = nw_mask; + Vec ne_mask_copy = ne_mask; + Vec sw_mask_copy = sw_mask; + Vec se_mask_copy = se_mask; + auto nw_val = mask_gather(Vec(0), inp_slice_C_ptr, i_nw_offset, nw_mask_copy); + auto ne_val = mask_gather(Vec(0), inp_slice_C_ptr, i_ne_offset, ne_mask_copy); + auto sw_val = mask_gather(Vec(0), inp_slice_C_ptr, i_sw_offset, sw_mask_copy); + auto se_val = mask_gather(Vec(0), inp_slice_C_ptr, i_se_offset, se_mask_copy); + + auto interpolated = (nw_val * nw) + (ne_val * ne) + (sw_val * sw) + (se_val * se); + interpolated.store(out_slice[c].data() + offset, len); + } + } + + inline void backward(TensorAccessor& gInp_slice, + TensorAccessor& gGrid_slice, + const TensorAccessor& gOut_slice, + const TensorAccessor& inp_slice, + int64_t offset, const Vec& grid_x, const Vec& grid_y, + int64_t len) const { + Vec x, y, gx_mult, gy_mult; + std::tie(x, gx_mult) = compute_W.apply_get_grad(grid_x); + std::tie(y, gy_mult) = compute_H.apply_get_grad(grid_y); + + Vec n, s, w, e, nw, ne, sw, se, nw_mask, ne_mask, sw_mask, se_mask; + iVec i_y_n, i_x_w; + + std::tie( + n, s, w, e, nw, ne, sw, se, nw_mask, ne_mask, sw_mask, se_mask, + i_y_n, i_x_w) = compute_interp_params(x, y); + + auto i_nw_offset = i_y_n * iVec(inp_sH) + i_x_w * iVec(inp_sW); + auto i_ne_offset = i_nw_offset + iVec(inp_sW); + auto i_sw_offset = i_nw_offset + iVec(inp_sH); + auto i_se_offset = i_sw_offset + iVec(inp_sW); + + auto i_gInp_nw_offset = i_y_n * iVec(inp_W) + i_x_w; + auto i_gInp_ne_offset = i_gInp_nw_offset + iVec(1); + auto i_gInp_sw_offset = i_gInp_nw_offset + iVec(inp_W); + auto i_gInp_se_offset = i_gInp_sw_offset + iVec(1); + + // When reading input values, we used mask_gather. Unfortunately, there is + // no mask_scatter_add (the backward of mask_gather) in Intel intrinsics. + // So we store the necessary vectors to temporary arrays and use the helper + // mask_scatter_add defined above. 
+ + integer_t i_gInp_nw_offset_arr[iVec::size]; + integer_t i_gInp_ne_offset_arr[iVec::size]; + integer_t i_gInp_sw_offset_arr[iVec::size]; + integer_t i_gInp_se_offset_arr[iVec::size]; + i_gInp_nw_offset.store(i_gInp_nw_offset_arr); + i_gInp_ne_offset.store(i_gInp_ne_offset_arr); + i_gInp_sw_offset.store(i_gInp_sw_offset_arr); + i_gInp_se_offset.store(i_gInp_se_offset_arr); + + integer_t i_nw_mask_arr[iVec::size]; + integer_t i_ne_mask_arr[iVec::size]; + integer_t i_sw_mask_arr[iVec::size]; + integer_t i_se_mask_arr[iVec::size]; + nw_mask.store(i_nw_mask_arr); + ne_mask.store(i_ne_mask_arr); + sw_mask.store(i_sw_mask_arr); + se_mask.store(i_se_mask_arr); + + scalar_t gInp_corner_arr[Vec::size]; + + auto gx = Vec(0), gy = Vec(0); + #pragma unroll + for (int64_t c = 0; c < C; ++c) { + auto inp_slice_C_ptr = inp_slice[c].data(); + auto gInp_slice_C_ptr = gInp_slice[c].data(); + auto gOut = Vec::loadu(gOut_slice[c].data() + offset, len); + + (nw * gOut).store(gInp_corner_arr); + mask_scatter_add(gInp_corner_arr, gInp_slice_C_ptr, i_gInp_nw_offset_arr, i_nw_mask_arr, len); + (ne * gOut).store(gInp_corner_arr); + mask_scatter_add(gInp_corner_arr, gInp_slice_C_ptr, i_gInp_ne_offset_arr, i_ne_mask_arr, len); + (sw * gOut).store(gInp_corner_arr); + mask_scatter_add(gInp_corner_arr, gInp_slice_C_ptr, i_gInp_sw_offset_arr, i_sw_mask_arr, len); + (se * gOut).store(gInp_corner_arr); + mask_scatter_add(gInp_corner_arr, gInp_slice_C_ptr, i_gInp_se_offset_arr, i_se_mask_arr, len); + + // mask_gather zeros out the mask, so we need to make copies + Vec nw_mask_copy = nw_mask; + Vec ne_mask_copy = ne_mask; + Vec sw_mask_copy = sw_mask; + Vec se_mask_copy = se_mask; + auto nw_val = mask_gather(Vec(0), inp_slice_C_ptr, i_nw_offset, nw_mask_copy); + auto ne_val = mask_gather(Vec(0), inp_slice_C_ptr, i_ne_offset, ne_mask_copy); + auto sw_val = mask_gather(Vec(0), inp_slice_C_ptr, i_sw_offset, sw_mask_copy); + auto se_val = mask_gather(Vec(0), inp_slice_C_ptr, i_se_offset, se_mask_copy); + + gx = gx + ((ne_val - nw_val) * s + (se_val - sw_val) * n) * gOut; + gy = gy + ((sw_val - nw_val) * e + (se_val - ne_val) * w) * gOut; + } + + gx = gx * gx_mult; + gy = gy * gy_mult; + + constexpr int64_t step = Vec::size; + auto interleaved_gGrid = interleave2(gx, gy); + auto gGrid_ptr = gGrid_slice.data() + offset * 2; + std::get<0>(interleaved_gGrid).store(gGrid_ptr, + std::min(len * 2, step)); + std::get<1>(interleaved_gGrid).store(gGrid_ptr + step, + std::max(static_cast(0), len * 2 - step)); + } +}; + +template +struct ApplyGridSample { + using Vec = Vec256; + using integer_t = int_same_size_t; + using iVec = Vec256; + + const int64_t inp_H; + const int64_t inp_W; + const int64_t inp_sH; + const int64_t inp_sW; + const int64_t C; + const int64_t inp_sC; + const ComputeLocation compute_H; + const ComputeLocation compute_W; + const bool must_in_bound = padding != GridSamplerPadding::Zeros; + + ApplyGridSample(const TensorAccessor& input) + : inp_H(input.size(2)) + , inp_W(input.size(3)) + , inp_sH(input.stride(2)) + , inp_sW(input.stride(3)) + , C(input.size(1)) + , inp_sC(input.stride(1)) + , compute_H(input.size(2)) + , compute_W(input.size(3)) {} + + inline void forward(TensorAccessor& out_slice, + const TensorAccessor& inp_slice, + int64_t offset, const Vec& grid_x, const Vec& grid_y, + int64_t len) const { + auto x = compute_W.apply(grid_x); + auto y = compute_H.apply(grid_y); + + auto x_nearest = x.round(); + auto y_nearest = y.round(); + + auto i_x_nearest = convert_to_int_of_same_size(x_nearest); + auto 
i_y_nearest = convert_to_int_of_same_size(y_nearest); + + auto i_mask = must_in_bound ? iVec(-1) + : (i_x_nearest > iVec(-1)) & (i_x_nearest < iVec(inp_W)) & + (i_y_nearest > iVec(-1)) & (i_y_nearest < iVec(inp_H)); + auto mask = cast(i_mask); + + auto i_offset = i_y_nearest * iVec(inp_sH) + i_x_nearest * iVec(inp_sW); + + auto out_ptr = out_slice.data() + offset; + auto out_sC = out_slice.stride(0); + auto inp_slice_ptr = inp_slice.data(); + #pragma unroll + for (int c = 0; c < C; ++c, out_ptr += out_sC, inp_slice_ptr += inp_sC) { + // mask_gather zeros out the mask, so we need to make a copy + auto mask_copy = mask; + auto inp_val = mask_gather(Vec(0), inp_slice_ptr, i_offset, mask_copy); + inp_val.store(static_cast(out_ptr), len); + } + } + + inline void backward(TensorAccessor& gInp_slice, + TensorAccessor& gGrid_slice, + const TensorAccessor& gOut_slice, + const TensorAccessor& inp_slice, + int64_t offset, const Vec& grid_x, const Vec& grid_y, + int64_t len) const { + auto x = compute_W.apply(grid_x); + auto y = compute_H.apply(grid_y); + + auto x_nearest = x.round(); + auto y_nearest = y.round(); + + auto i_x_nearest = convert_to_int_of_same_size(x_nearest); + auto i_y_nearest = convert_to_int_of_same_size(y_nearest); + + auto i_mask = must_in_bound ? iVec(-1) + : (i_x_nearest > iVec(-1)) & (i_x_nearest < iVec(inp_W)) & + (i_y_nearest > iVec(-1)) & (i_y_nearest < iVec(inp_H)); + + auto i_gInp_offset = i_y_nearest * iVec(inp_W) + i_x_nearest; // gInp is contiguous + + integer_t mask_arr[iVec::size]; + i_mask.store(mask_arr); + integer_t gInp_offset_arr[iVec::size]; + i_gInp_offset.store(gInp_offset_arr); + + #pragma unroll + for (int64_t c = 0; c < C; ++c) { + mask_scatter_add(gOut_slice[c].data() + offset, gInp_slice[c].data(), + gInp_offset_arr, mask_arr, len); + } + + // grid has zero 0 gradient in Nearest mode + auto gGrid_ptr = gGrid_slice.data() + offset * 2; + std::memset(gGrid_ptr, 0, sizeof(scalar_t) * len * 2); + } +}; + +// ~~~~~~~~~~~~~~~~~~ grid_sample_2d_grid_slice_iterator ~~~~~~~~~~~~~~~~~~~~~~ +// Function to apply a vectorized function on a grid slice tensor (without batch +// dimension). +// See NOTE [ Grid Sample CPU Kernels ] for details. + +template +static inline void grid_sample_2d_grid_slice_iterator( + const TensorAccessor& grid_slice, const ApplyFn &apply_fn) { + int64_t out_H = grid_slice.size(0); + int64_t out_W = grid_slice.size(1); + int64_t grid_sH = grid_slice.stride(0); + int64_t grid_sW = grid_slice.stride(1); + int64_t grid_sCoor = grid_slice.stride(2); + auto grid_ptr = grid_slice.data(); + + using Vec = Vec256; + using iVec = Vec256>; + constexpr int64_t step = Vec::size; + + // Loop over each output pixel in grid. + // We consider the following three cases (after slicing out the batch + // dimension). + // See detailed discussions under each if-case. + + if (at::geometry_is_contiguous({out_H, out_W, 2}, {grid_sH, grid_sW, grid_sCoor})) { + // Case 1: + // Grid is contiguous. + // Strategy: Sequentially load two vectors at the same time, and get, + // e.g., {x0, y0, x1, y1}, {x2, y2, x3, y3}. Then we use + // at::vec256::deinterleave2 to get x and y vectors. 
+ auto total_size = out_H * out_W; + for (int64_t spatial_offset = 0; spatial_offset < total_size; spatial_offset += step) { + auto grid_offset = spatial_offset * 2; + auto len = std::min(step, total_size - spatial_offset); + auto vec1 = Vec::loadu(grid_ptr + grid_offset, + std::min(step, len * 2)); + auto vec2 = Vec::loadu(grid_ptr + grid_offset + step, + std::max(static_cast(0), len * 2 - step)); + auto vec_xy_pair = deinterleave2(vec1, vec2); + + auto x = std::get<0>(vec_xy_pair); + auto y = std::get<1>(vec_xy_pair); + + // make sure that x and y are valid grid sample locations + if (len < step) { + x = Vec::set(Vec(0), x, len); + y = Vec::set(Vec(0), y, len); + } + apply_fn(x, y, spatial_offset, len); + } + } else if (grid_sW == 1 || out_W == 1) { + // Case 2: + // The W dimension is contiguous. + // This can be common, e.g., grid is from a conv net output of shape + // [N, 2, H, W]. + // Strategy: Divide into two contiguous slices each of shape [H, W], and + // each containing x and y vectors. So we sequentially load a + // vector from each of them to get x and y vector + + // Function to apply along a contiguous W dimension (or flattened H x W). + auto line_fn = [&](const scalar_t *grid_ptr_x, const scalar_t *grid_ptr_y, + int64_t out_base_offset, int64_t total_size) { + for (int64_t i = 0; i < total_size; i += step) { + auto len = std::min(step, total_size - i); + auto x = Vec::loadu(grid_ptr_x + i, len); + auto y = Vec::loadu(grid_ptr_y + i, len); + // make sure that x and y are valid grid sample locations + if (len < step) { + x = Vec::set(Vec(0), x, len); + y = Vec::set(Vec(0), y, len); + } + apply_fn(x, y, out_base_offset + i, len); + } + }; + + if (at::geometry_is_contiguous({out_H, out_W}, {grid_sH, grid_sW})) { + // If [H, W] is contiguous, apply line_fn once. + line_fn(grid_ptr, grid_ptr + grid_sCoor, 0, out_H * out_W); + } else { + // If only [W] is contiguous, apply line_fn once for each h slice. + auto grid_ptr_NH = grid_ptr; + for (int64_t h = 0; h < out_H; h++) { + line_fn(grid_ptr_NH, grid_ptr_NH + grid_sCoor, h * out_W, out_W); + grid_ptr_NH += grid_sH; + } + } + } else { + // Case 3: + // General case. + // Strategy: Do a for-loop over H, for each W slice, use + // at::vec256::gather to load the x and y vectors. + auto spatial_offset = 0; + auto i_offsets_delta = iVec(grid_sW * step); + + #pragma unroll + for (int64_t h = 0; h < out_H; h++) { + auto grid_ptr_x = grid_ptr + h * grid_sH; + auto grid_ptr_y = grid_ptr_x + grid_sCoor; + auto i_offsets = iVec::arange(0, grid_sW); + #pragma unroll + for (int64_t w = 0; w < out_W; w += step) { + auto len = std::min(step, out_W - w); + if (len < step) { + // prevents illegal memory access, sets the exceeding offsets to zero + i_offsets = iVec::set(iVec(0), i_offsets, len); + } + apply_fn(gather(grid_ptr_x, i_offsets), + gather(grid_ptr_y, i_offsets), + spatial_offset, len); + + i_offsets = i_offsets + i_offsets_delta; + spatial_offset += len; + } + } + } +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~ Grid Sample Kernels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Use the structs & functions defined above to calculate grid sample forward +// and backward. +// See NOTE [ Grid Sample CPU Kernels ] for details. 
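// (A rough note on the scheduling below: grain_size is the minimum number of
// batch entries handed to one parallel_for task. It divides
// at::internal::GRAIN_SIZE by an estimate of the per-batch work
// (spatial_size * 4 in forward, i.e. two grid coordinates across two tensors,
// per the inline comment), so small spatial sizes group several batch entries
// into one task, while large spatial sizes get roughly one entry per task.)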
+ +Tensor grid_sampler_2d_cpu_kernel_impl(const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, + int64_t padding_mode) { + auto N = input.size(0); + auto H = grid.size(1); + auto W = grid.size(2); + auto output = at::empty({N, input.size(1), H, W}, input.options()); + auto spatial_size = H * W; + auto grain_size = spatial_size == 0 ? (N + 1) + : at::divup(at::internal::GRAIN_SIZE, spatial_size * 4 /* 2d * 2 tensors*/); + +#define HANDLE_CASE(interp, padding) \ + case padding: { \ + ApplyGridSample grid_sample(inp_acc); \ + parallel_for(0, N, grain_size, [&](int64_t begin, int64_t end) { \ + for (int64_t n = begin; n < end; n++) { \ + auto out_slice = out_acc[n]; \ + auto inp_slice = inp_acc[n]; \ + grid_sample_2d_grid_slice_iterator( \ + grid_acc[n], \ + [&](const Vec256& grid_x, const Vec256& grid_y, \ + int64_t spatial_offset, int64_t len) { \ + grid_sample.forward(out_slice, inp_slice, spatial_offset, \ + grid_x, grid_y, len); \ + }); \ + } \ + }); \ + return; \ + } + +#define HANDLE_INTERP(interp) \ + case interp: { \ + switch (static_cast(padding_mode)) { \ + HANDLE_CASE(interp, GridSamplerPadding::Zeros); \ + HANDLE_CASE(interp, GridSamplerPadding::Border); \ + HANDLE_CASE(interp, GridSamplerPadding::Reflection); \ + } \ + return; \ + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_2d_cpu_kernel_impl", [&] { + auto out_acc = output.accessor(); + auto inp_acc = input.accessor(); + auto grid_acc = grid.accessor(); + switch (static_cast(interpolation_mode)) { + HANDLE_INTERP(GridSamplerInterpolation::Bilinear); + HANDLE_INTERP(GridSamplerInterpolation::Nearest); + } + }); +#undef HANDLE_CASE +#undef HANDLE_INTERP + + return output; +} + +std::tuple +grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_, + const Tensor& input, + const Tensor& grid, + int64_t interpolation_mode, + int64_t padding_mode) { + // grad_output should be contiguous most of time. Ensuring that it is + // contiguous can greatly simplify this code. + auto grad_output = grad_output_.contiguous(); + + auto grad_input = at::zeros_like(input); + auto grad_grid = at::empty_like(grid); + auto N = input.size(0); + auto spatial_size = grid.size(1) * grid.size(2); + auto grain_size = spatial_size == 0 ? 
(N + 1) + : at::divup(at::internal::GRAIN_SIZE, spatial_size * 10 /* 2d * 5 tensors*/); + +#define HANDLE_CASE(interp, padding) \ + case padding: { \ + ApplyGridSample grid_sample(inp_acc); \ + parallel_for(0, N, grain_size, [&](int64_t begin, int64_t end) { \ + for (int64_t n = begin; n < end; n++) { \ + auto gInp_slice = gInp_acc[n]; \ + auto gGrid_slice = gGrid_acc[n]; \ + auto gOut_slice = gOut_acc[n]; \ + auto inp_slice = inp_acc[n]; \ + grid_sample_2d_grid_slice_iterator( \ + grid_acc[n], \ + [&](const Vec256& grid_x, const Vec256& grid_y, \ + int64_t spatial_offset, int64_t len) { \ + grid_sample.backward(gInp_slice, gGrid_slice, gOut_slice, inp_slice, \ + spatial_offset, grid_x, grid_y, len); \ + }); \ + } \ + }); \ + return; \ + } + +#define HANDLE_INTERP(interp) \ + case interp: { \ + switch (static_cast(padding_mode)) { \ + HANDLE_CASE(interp, GridSamplerPadding::Zeros); \ + HANDLE_CASE(interp, GridSamplerPadding::Border); \ + HANDLE_CASE(interp, GridSamplerPadding::Reflection); \ + } \ + return; \ + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_2d_backward_cpu_kernel_impl", [&] { + auto gInp_acc = grad_input.accessor(); + auto gGrid_acc = grad_grid.accessor(); + auto inp_acc = input.accessor(); + auto grid_acc = grid.accessor(); + auto gOut_acc = grad_output.accessor(); + switch (static_cast(interpolation_mode)) { + HANDLE_INTERP(GridSamplerInterpolation::Bilinear); + HANDLE_INTERP(GridSamplerInterpolation::Nearest); + } + }); +#undef HANDLE_CASE +#undef HANDLE_INTERP + + return std::make_tuple(grad_input, grad_grid); +} + +} + +REGISTER_DISPATCH(grid_sampler_2d_cpu_kernel, &grid_sampler_2d_cpu_kernel_impl); +REGISTER_DISPATCH(grid_sampler_2d_backward_cpu_kernel, &grid_sampler_2d_backward_cpu_kernel_impl); + + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.h b/aten/src/ATen/native/cpu/GridSamplerKernel.h new file mode 100644 index 00000000000000..36ba3a91cc9bb8 --- /dev/null +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.h @@ -0,0 +1,18 @@ +#pragma once + +#include "ATen/ATen.h" +#include "ATen/Dispatch.h" +#include "ATen/NativeFunctions.h" +#include "ATen/native/DispatchStub.h" +#include "ATen/cpu/vml.h" + +#include + +namespace at { namespace native { + +using forward_2d_fn = Tensor(*)(const Tensor &, const Tensor &, int64_t, int64_t); +using backward_2d_fn = std::tuple(*)(const Tensor &, const Tensor &, const Tensor &, int64_t, int64_t); +DECLARE_DISPATCH(forward_2d_fn, grid_sampler_2d_cpu_kernel); +DECLARE_DISPATCH(backward_2d_fn, grid_sampler_2d_backward_cpu_kernel); + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index feea350fd08306..3be4ed4e06c28d 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -183,9 +183,163 @@ static void prod_kernel_impl(Tensor& result, const Tensor& self, at::optional +struct NormReduction { + // reduction width in number of scalar elements + static constexpr int WIDTH = 128 / sizeof(scalar_t); + using Vec = Vec256; + + static void apply(Tensor& res, const Tensor& self, Scalar p, at::optional dim) { + auto out_ = res.data(); + auto data_ = self.data(); + auto numel = self.numel(); + float pval = 0.0; + if (p.isIntegral()){ + pval = p.to(); + } else if (p.isFloatingPoint()) { + pval = p.to(); + } + if (!dim.has_value()) { + *out_ = reduce_all(data_, numel, pval); + return; + } + int64_t n = self.size(*dim); + int64_t stride = self.stride(*dim); + 
// A contiguous tensor does not need to hold a meaningful stride + // if the corresponding size is 1 + if (n == 1) { + stride = 1; + for (int64_t i = self.ndimension() - 1; i > *dim; i--) { + stride *= self.size(i); + } + } + int64_t batch = numel / n; + parallel_for(0, batch, 1, [=](int64_t begin, int64_t end) { + for (int64_t bi = begin; bi < end; bi++) { + int64_t b = bi / stride; + int64_t i = bi % stride; + const scalar_t* data = &data_[b * n * stride + i]; + out_[bi] = norm_reduce(data, n, stride, pval); + } + }); + } + + static scalar_t reduce_all(const scalar_t* data_, int64_t size, float pval) { + scalar_t sum = parallel_reduce( + 0, + size, + internal::GRAIN_SIZE, + (scalar_t)0, + [=](int64_t begin, int64_t end, scalar_t init) { + const scalar_t* data = &data_[begin]; + int64_t n = end - begin; + scalar_t result = norm_reduce(data, n, 1, pval); + return result; + }, + std::plus()); + return sum; + } + + static scalar_t norm_reduce(const scalar_t* data, int64_t n, int64_t stride, float pval) { + scalar_t result = 0.0; + if (stride == 1 && (pval == 1 || pval == 2 || pval == 3) && n >= WIDTH) { + int64_t n_rounded = round_down(n, WIDTH); + scalar_t result1 = norm_reduce128(data, n_rounded, pval); + scalar_t result2 = norm_reduce_sequential(data + n_rounded, n - n_rounded, stride, pval); + result = std::pow(std::pow(result1, pval) + std::pow(result2, pval), 1.0/pval); + } else { + result = norm_reduce_sequential(data, n, stride, pval); + } + return result; + } + + static scalar_t norm_reduce_sequential(const scalar_t* data, int64_t n, int64_t stride, float pval) { + scalar_t result = 0.0; + if (pval == 0) { + for (int64_t k = 0; k < n; k++) { + result += (data[k * stride] != 0.0); + } + } else if (pval == 1) { + for (int64_t k = 0; k < n; k++) { + result += std::abs(data[k * stride]); + } + } else if (pval == 2) { + for (int64_t k = 0; k < n; k++) { + result += data[k * stride] * data[k * stride]; + } + result = std::sqrt(result); + } else if (pval == 3) { + for (int64_t k = 0; k < n; k++) { + result += std::abs(data[k * stride] * data[k * stride] * data[k * stride]); + } + result = std::pow(result, 1.0/3); + } else if (std::isinf(pval)) { + for (int64_t k = 0; k < n; k++) { + result = std::abs(data[k * stride]) > result ? 
std::abs(data[k * stride]) : result; + } + result = result; + } else { + for (int64_t k = 0; k < n; k++) { + result += std::pow(std::abs(data[k * stride]), pval); + } + result = std::pow(result, 1.0/pval); + } + return result; + } + + // Reduce down a column of WIDTH elements (128 bytes) with the given number n + // n is already rounded by 128 + static scalar_t norm_reduce128(const scalar_t* data, int64_t n, float pval) { + scalar_t result = 0.0; + Vec acc[4] = {0.0, 0.0, 0.0, 0.0}; // 128 bytes (two cache lines) + static_assert(sizeof(acc) == 128, "accumulator should be 128 bytes"); + int64_t rows = n / WIDTH; + if (pval == 1){ + for (int row = 0; row < rows; row ++) { + for (int j = 0; j != 4; j++) { + auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]); + acc[j] = acc[j] + val.abs(); + } + } + } + else if (pval == 2) { + for (int row = 0; row < rows; row ++) { + for (int j = 0; j != 4; j++) { + auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]); + acc[j] = acc[j] + val * val; + } + } + } + else if (pval == 3) { + for (int row = 0; row < rows; row ++) { + for (int j = 0; j != 4; j++) { + auto val = Vec::loadu(&data[row * WIDTH + j * Vec::size]); + acc[j] = acc[j] + (val * val * val).abs(); + } + } + } + scalar_t buf[WIDTH] = {0}; + for (int j = 0; j != 4; j++) { + acc[j].store(&buf[j * Vec::size]); + } + for (int i = 0; i < WIDTH; i++) { + result += buf[i]; + } + result = std::pow(result, 1.0/pval); + return result; + } +}; + +static void norm_kernel_impl(Tensor& result, const Tensor& self, Scalar p, at::optional dim) { + AT_DISPATCH_FLOATING_TYPES(self.type(), "norm", [&] { + NormReduction::apply(result, self, p, dim); + }); +} + } // anonymous namespace REGISTER_DISPATCH(sum_kernel, &sum_kernel_impl); REGISTER_DISPATCH(prod_kernel, &prod_kernel_impl); +REGISTER_DISPATCH(norm_kernel, &norm_kernel_impl); }} // namespace at::native diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.h b/aten/src/ATen/native/cpu/ReduceOpsKernel.h index 4c5c8c15149a1b..5fc7c60ff2803d 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.h +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.h @@ -11,4 +11,7 @@ using reduce_fn = void(*)(Tensor &, const Tensor &, at::optional); DECLARE_DISPATCH(reduce_fn, sum_kernel); DECLARE_DISPATCH(reduce_fn, prod_kernel); +using reduce_norm_fn = void(*)(Tensor &, const Tensor &, Scalar, at::optional); +DECLARE_DISPATCH(reduce_norm_fn, norm_kernel); + }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h index 37c8f3a364f75a..8715a9ef460ee6 100644 --- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -112,22 +112,16 @@ class CuFFTConfig { if (input.type().scalarType() == ScalarType::Half) { // cuFFT on half requires compute capability of at least SM_53 auto dev_prop = at::cuda::getCurrentDeviceProperties(); - if (dev_prop->major < 5 || (dev_prop->major == 5 && dev_prop->minor < 3)) { - std::ostringstream ss; - ss << "cuFFT doesn't support signals of half type with compute " - << "capability less than SM_53, but the device containing input half " - << "tensor only has SM_" << dev_prop->major << dev_prop->minor; - throw std::runtime_error(ss.str()); - } + AT_CHECK(dev_prop->major >= 5 && !(dev_prop->major == 5 && dev_prop->minor < 3), + "cuFFT doesn't support signals of half type with compute " + "capability less than SM_53, but the device containing input half " + "tensor only has SM_", dev_prop->major, dev_prop->minor); for (int64_t i = 0; i < 
signal_ndim; i++) { auto signal_size = checked_signal_sizes[i]; - if (!is_pow_of_two(signal_size)) { - std::ostringstream ss; - ss << "cuFFT doesn't support signals of half type with size at any " - << "dimension that is not a power of two, but got a signal size of " - << checked_signal_sizes; - throw std::runtime_error(ss.str()); - } + AT_CHECK(is_pow_of_two(signal_size), + "cuFFT doesn't support signals of half type with size at any ", + "dimension that is not a power of two, but got a signal size of ", + checked_signal_sizes); } clone_input |= input.stride(signal_ndim) != 1; } @@ -212,7 +206,7 @@ class CuFFTConfig { } else if (!complex_input && complex_output) { exec_type = HIPFFT_R2C; } else { - throw std::runtime_error("hipFFT doesn't support r2r (float)"); + AT_ERROR("hipFFT doesn't support r2r (float)"); } } else if (input.type().scalarType() == ScalarType::Double) { if (complex_input && complex_output) { @@ -222,13 +216,13 @@ class CuFFTConfig { } else if (!complex_input && complex_output) { exec_type = HIPFFT_D2Z; } else { - throw std::runtime_error("hipFFT doesn't support r2r (double)"); + AT_ERROR("hipFFT doesn't support r2r (double)"); } } else { std::ostringstream ss; ss << "hipFFT doesn't support tensor of type: " << at::toString(input.type().scalarType()); - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } #else @@ -249,7 +243,7 @@ class CuFFTConfig { std::ostringstream ss; ss << "cuFFT doesn't support tensor of type: " << at::toString(input.type().scalarType()); - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } #endif diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index 853c04deb2215c..afa4c8e1916604 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -73,14 +73,22 @@ __global__ void EmbeddingBag_updateOutputKernel( } } if (mode == MODE_MEAN) { - weightFeatSum = weightFeatSum / static_cast(bag_size_); - bag_size[bag] = bag_size_; + if (end == begin) { + bag_size[bag] = 0; + } else { + weightFeatSum = weightFeatSum / static_cast(bag_size_); + bag_size[bag] = bag_size_; + } } if (mode == MODE_MEAN || mode == MODE_SUM) { output[bag * featureSize + featureDim] = static_cast(weightFeatSum); } else if (mode == MODE_MAX) { + if (end == begin) { + // If bag is empty, set output to 0. + weightFeatMax = 0; + } max_indices[bag * featureSize + featureDim] = maxWord; output[bag * featureSize + featureDim] = weightFeatMax; } @@ -268,8 +276,10 @@ __global__ void EmbeddingBag_accGradParametersKernel_max( int64_t bag = chunk / chunksPerBag; int64_t word_idx = max_indices[bag * stride + featureDim]; - - atomicAdd(&(gradWeight[word_idx * stride + featureDim]), gradOutput[bag * stride + featureDim]); + if (word_idx >= 0) { + // If bag is empty, we have max_indices[idx] set to -1 in forward. + atomicAdd(&(gradWeight[word_idx * stride + featureDim]), gradOutput[bag * stride + featureDim]); + } } } } diff --git a/aten/src/ATen/native/cuda/Gesv.cu b/aten/src/ATen/native/cuda/Gesv.cu index bc37e83990e192..80c7aaeb74f6a8 100644 --- a/aten/src/ATen/native/cuda/Gesv.cu +++ b/aten/src/ATen/native/cuda/Gesv.cu @@ -84,7 +84,7 @@ static inline Storage pin_memory(int64_t size, Tensor dummy) { name = static_cast(storage_##name.data()); template -static void applyGesv(Tensor& b, Tensor& A, std::vector infos) { +static void applyGesv(Tensor& b, Tensor& A, std::vector& infos) { #ifndef USE_MAGMA AT_ERROR("gesv: MAGMA library not found in " "compilation. 
Please rebuild with MAGMA."); diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu new file mode 100644 index 00000000000000..1c3609f50b201c --- /dev/null +++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu @@ -0,0 +1,25 @@ +#include "ATen/ATen.h" + +namespace at { namespace native { + +Tensor baddbmm_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + return _th_baddbmm(self, batch1, batch2, beta, alpha); +} + +Tensor& baddbmm_out_cuda(Tensor &result, const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + return _th_baddbmm_out(result, self, batch1, batch2, beta, alpha); +} + +Tensor& baddbmm__cuda(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + return _th_baddbmm_out(self, self, batch1, batch2, beta, alpha); +} + +Tensor bmm_cuda(const Tensor& self, const Tensor& mat2) { + return _th_bmm(self, mat2); +} + +Tensor& bmm_out_cuda(Tensor &result, const Tensor& batch1, const Tensor& batch2) { + return _th_bmm_out(result, batch1, batch2); +} + +} } diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu index 16d7935f3d49fa..dc0b5af8d4e264 100644 --- a/aten/src/ATen/native/cuda/LossCTC.cu +++ b/aten/src/ATen/native/cuda/LossCTC.cu @@ -227,7 +227,7 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const Tensor neg_log_likelihood = at::empty({batch_size}, log_probs.options()); // Very likely, we could be more clever here, e.g. learning (or genralizing and reusing) from SoftMax.cu... - constexpr int max_threads = 1024; + constexpr int max_threads = std::is_same::value ? 1024 : 896; // we need 72 or so 32 bit registers for double int threads_target = max_threads; while (threads_target / 2 >= 2*max_target_length+1) { threads_target /= 2; @@ -247,6 +247,7 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), tg_batch_offsets.data(), tg_target_stride, batch_size, BLANK); + THCudaCheck(cudaGetLastError()); // catch launch errors return std::make_tuple(neg_log_likelihood, log_alpha); } @@ -452,7 +453,7 @@ __global__ void ctc_loss_backward_collect_gpu_kernel(scalar_t* __restrict__ grad scalar_t& res = gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * c]; if (t < input_length) { scalar_t lp = log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * c]; - res = std::exp(lp)-std::exp(res + nll - lp) * gr; + res = (std::exp(lp)-std::exp(res + nll - lp)) * gr; } else { res = 0.; @@ -505,7 +506,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ Tensor grad = at::full_like(log_probs, neginf); // initialization for log(sum (alpha beta)) // As above, there may be better configurations to use. - constexpr int max_threads = 1024; + constexpr int max_threads = std::is_same::value ? 
1024 : 896; // we need 72 or so 32 bit registers for double int threads_target = max_threads; while (threads_target / 2 >= 2*max_target_length+1) { threads_target /= 2; @@ -526,6 +527,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), tg_batch_offsets.data(), tg_target_stride, batch_size, BLANK); + THCudaCheck(cudaGetLastError()); // catch launch errors } // Very crude heuristic for what is a small problem., based on linearly regressing problem dimensions on @@ -550,7 +552,10 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ .sub_(log_probs.narrow(2, BLANK, 1)) .exp_() ); - // Tor the non-blank characters, we use a kernel to compute the subtrahend. + // scale by output gradient (blanks and first summand of non-blanks) + grad *= grad_out.view({1, batch_size, 1}); + + // For the non-blank characters, we use a kernel to compute the subtrahend. // Again we might configure block and grid in a better way. int threads_target = max_threads; while (threads_target / 2 >= max_target_length) { @@ -572,6 +577,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), tg_batch_offsets.data(), tg_target_stride, batch_size, num_labels, BLANK); + THCudaCheck(cudaGetLastError()); // catch launch errors } else { // small problem, use naive algorithm // Still no block/grid configuration guru... int threads_input = max_threads; @@ -595,6 +601,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), tg_batch_offsets.data(), tg_target_stride, batch_size, num_labels, BLANK); + THCudaCheck(cudaGetLastError()); // catch launch errors } return grad; } diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index c82f4e7afb87de..38b1dddb496276 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -206,7 +206,7 @@ static inline Tensor _run_cufft( CUFFT_CHECK(hipfftExecR2C(plan, static_cast(input.data_ptr()), static_cast(output.data_ptr()))); } else { - throw std::runtime_error("hipFFT doesn't support r2r (float)"); + AT_ERROR("hipFFT doesn't support r2r (float)"); } } else if (input.type().scalarType() == ScalarType::Double) { if (complex_input && complex_output) { @@ -220,13 +220,13 @@ static inline Tensor _run_cufft( CUFFT_CHECK(hipfftExecD2Z(plan, static_cast(input.data_ptr()), static_cast(output.data_ptr()))); } else { - throw std::runtime_error("hipFFT doesn't support r2r (double)"); + AT_ERROR("hipFFT doesn't support r2r (double)"); } } else { std::ostringstream ss; ss << "hipFFT doesn't support tensor of type: " << at::toString(input.type().scalarType()); - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } #else CUFFT_CHECK(cufftXtExec(plan, input.data_ptr(), output.data_ptr(), diff --git a/aten/src/ATen/native/cuda/WeightNorm.cu b/aten/src/ATen/native/cuda/WeightNorm.cu new file mode 100644 index 00000000000000..67d8f39e2de71d --- /dev/null +++ b/aten/src/ATen/native/cuda/WeightNorm.cu @@ -0,0 +1,502 @@ +#include "ATen/ATen.h" +#include "ATen/AccumulateType.h" +#include "ATen/TensorUtils.h" +#include "ATen/core/Error.h" + +#include "ATen/cuda/CUDAContext.h" +#include +#include + +namespace at { +namespace native { +namespace { + +// Block size for weight_norm_*_first_dim_kernel. 
+// Currently, kernels are non-persistent. +// Dialing up the block size to, say 1024, can improve performance by +// increase the amount of cache available per block, which can improve cache hit rate. +// However, this is less efficient for short rows. 256 is pretty versatile. +// May be worth implementing heuristics later. +#define BLOCK 256 + +// Block size for weight_norm_*_last_dim_kernel. +// This is tricker than the first_dim case because we must make blocks +// at least 16 fast elements wide to ensure fully-coalesced half-precision accesses. +// Since output-element parallelism is along the fast dimension, this reduces the number of +// blocks we can launch by 16X. +#define TILE_W 16 +// Somewhat versatile strategy: max out intra-block parallelism by extending +// blocks across the slow dimension up to the hardware-max block size of 1024. +#define TILE_H 64 + +template +__device__ __forceinline__ void reduce_block_into_lanes + (T *x, + T val, + int lanes, // lanes is intended to be <= 32. + ReduceOp reduceOp) +{ + int tid = threadIdx.x + threadIdx.y*blockDim.x; + int blockSize = blockDim.x*blockDim.y; // blockSize is intended to be a multiple of 32. + + if(blockSize >= 64) + { + x[tid] = val; + __syncthreads(); + } + + #pragma unroll + for(int i = (blockSize >> 1); i >= 64; i >>= 1) + { + if(tid < i) + x[tid] = reduceOp(x[tid], x[tid+i]); + __syncthreads(); + } + + if(tid < 32) + { + T final; + if(blockSize >= 64) + final = reduceOp(x[tid], x[tid+32]); + else + final = val; + // __SYNCWARP(); + + #pragma unroll + for(int i = 16; i >= lanes; i >>= 1) + final = reduceOp(final, WARP_SHFL_DOWN(final, i)); + + if(tid < lanes) + x[tid] = final; // EpilogueOp + } + + // Make sure the smem result is visible to all warps. + __syncthreads(); +} + +template + +__global__ void weight_norm_fwd_first_dim_kernel + (scalar_t* __restrict__ w, + accscalar_t* __restrict__ norms, + const scalar_t* __restrict__ v, + const scalar_t* __restrict__ g, + const int rowSize) +{ + // We are norming each slowest-dim row of the tensor separately. + // For now, assign one block to each row. + const int tid = threadIdx.x; + const int row = blockIdx.x; + const int stride = blockDim.x; + + // Logical index offset for this flattened row + const int rowStart = row*rowSize; + + // Hack to get around nvcc complaining when an smem array is declared with the same name + // but different types in different kernels (in this case different instantiations) + // extern __shared__ accscalar_t s[]; // error: declaration is incompatible with previous "s" + extern __shared__ char buf[]; + accscalar_t* s = (accscalar_t*)buf; + + accscalar_t thread_sum = 0.f; + for(int i = tid; i < rowSize; i += stride ) + { + accscalar_t val_f = scalar_cast(v[i+rowStart]); + thread_sum += val_f*val_f; // AccumOp, could do Kahan here + } + + reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd()); + accscalar_t result = s[0]; + + result = sqrtf(result); + + if(tid == 0) + norms[row] = result; + + // Broadcast load, could use shared memory instead. 
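// Illustrative sketch (hypothetical, not part of this change): the "could use
// shared memory instead" alternative mentioned in the comment above would stage
// g[row] once per block and broadcast it from smem, roughly:
//
//   __shared__ accscalar_t g_staged;                      // hypothetical extra slot
//   if (tid == 0)
//     g_staged = scalar_cast<accscalar_t>(g[row]);
//   __syncthreads();
//   accscalar_t g_this_row = g_staged;
//
// The kernel keeps the plain per-thread load below: every thread reads the same
// address, which the cache already serves as a broadcast.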
+ accscalar_t g_this_row = scalar_cast(g[row]); + + accscalar_t rnorm = 1.f/result; // for consistency with backward kernel + + // Write data to output + for(int i = tid; i < rowSize; i += stride ) + { + accscalar_t val_f = scalar_cast(v[i+rowStart]); + w[i+rowStart] = scalar_cast(g_this_row*val_f*rnorm); + } +} + +template + +__global__ void weight_norm_fwd_last_dim_kernel +( + scalar_t* __restrict__ w, + accscalar_t* __restrict__ norms, + const scalar_t* __restrict__ v, + const scalar_t* __restrict__ g, + const int fast_dim_size, + const int slower_dims_size +) +{ + const int fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x; + + extern __shared__ char buf[]; + accscalar_t* alloc = (accscalar_t*)buf; + accscalar_t* s = &alloc[0]; + accscalar_t* rnorms_this_block = &alloc[blockDim.x*blockDim.y]; + + accscalar_t thread_sum = 0.f; + + int slower_dims_location = threadIdx.y; + int currentIdx = fast_dim_location + fast_dim_size*slower_dims_location; + if(fast_dim_location < fast_dim_size) + while(slower_dims_location < slower_dims_size) + { + accscalar_t val_f = scalar_cast(v[currentIdx]); + thread_sum += val_f*val_f; // AccumOp, could do Kahan here + currentIdx += blockDim.y*fast_dim_size; + slower_dims_location += blockDim.y; + } + + reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd()); + + // Better to pass an EpilogueOp to reduce_block_into_lanes? + if(threadIdx.y == 0) + { + accscalar_t result = s[threadIdx.x]; + accscalar_t norm_this_col = sqrtf(result); + norms[fast_dim_location] = norm_this_col; + rnorms_this_block[threadIdx.x] = 1.f/norm_this_col; + } + + __syncthreads(); + + accscalar_t g_this_col = scalar_cast(g[fast_dim_location]); + accscalar_t rnorm = rnorms_this_block[threadIdx.x]; + + slower_dims_location = threadIdx.y; + currentIdx = fast_dim_location + fast_dim_size*slower_dims_location; + if(fast_dim_location < fast_dim_size) + while(slower_dims_location < slower_dims_size) + { + accscalar_t val_f = scalar_cast(v[currentIdx]); + w[currentIdx] = scalar_cast(g_this_col*val_f*rnorm); + currentIdx += blockDim.y*fast_dim_size; + slower_dims_location += blockDim.y; + } +} + +template + +__global__ void weight_norm_bwd_first_dim_kernel + (scalar_t* __restrict__ grad_v, + scalar_t* __restrict__ grad_g, + const scalar_t* __restrict__ grad_w, + const scalar_t* __restrict__ saved_v, + const scalar_t* __restrict__ saved_g, + const accscalar_t* __restrict__ saved_norms, + const int rowSize) +{ + // For now, assign one block to each row. + const int tid = threadIdx.x; + const int row = blockIdx.x; + const int stride = blockDim.x; + + // Logical index offset for this flattened row + const int rowStart = row*rowSize; + + // Hack to get around nvcc complaining when an smem array is declared with the same name + // but different types in different kernels (in this case different instantiations) + // extern __shared__ accscalar_t s[]; // error: declaration is incompatible with previous "s" + extern __shared__ char buf[]; + accscalar_t* s = (accscalar_t*)buf; + + accscalar_t thread_sum = 0.f; + for(int i = tid; i < rowSize; i += stride ) + { + accscalar_t grad_wi = scalar_cast(grad_w[i+rowStart]); + accscalar_t saved_vi = scalar_cast(saved_v[i+rowStart]); + thread_sum += grad_wi*saved_vi; // AccumOp, could do Kahan here + } + + reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd()); + accscalar_t result = s[0]; + + // Could choose to save reciprocal of norm instead I suppose, but norms is probably + // more handy to keep around. 
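// Illustrative note (not part of this change): the algebra behind the rnorm /
// rnorm3 factors used below. With w = g * v / ||v|| per row and
// result = sum_i grad_w[i] * v[i] (the block reduction just computed),
//
//   grad_g    = result / ||v||                                  -> result * rnorm
//   grad_v[j] = g * (grad_w[j] / ||v|| - v[j] * result / ||v||^3)
//             -> g_this_row * (rnorm * grad_wj - rnorm3 * saved_vj * result)
//
// A scalar reference for one row (hypothetical helper, for comparison only):
//
//   void weight_norm_bwd_row_ref(float* grad_v, float& grad_g,
//                                const float* grad_w, const float* v,
//                                float g, float norm, float dot, int n) {
//     float rnorm = 1.f / norm, rnorm3 = rnorm * rnorm * rnorm;
//     grad_g = dot * rnorm;
//     for (int j = 0; j < n; ++j)
//       grad_v[j] = g * (rnorm * grad_w[j] - rnorm3 * v[j] * dot);
//   }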
+ // Broadcast load; could use shared memory instead. + accscalar_t rnorm = 1.f/saved_norms[row]; + accscalar_t rnorm3 = rnorm*rnorm*rnorm; + + // Write g gradients. + if(tid == 0) + grad_g[row] = scalar_cast(result*rnorm); + + // Broadcast load, could use shared memory instead. + accscalar_t g_this_row = scalar_cast(saved_g[row]); + + // Write v gradients. We are reusing values that were loaded earlier, so there + // is an optimization opportunity here (store values persistently). + for(int j = tid; j < rowSize; j += stride ) + { + accscalar_t grad_wj = scalar_cast(grad_w[j+rowStart]); + accscalar_t saved_vj = scalar_cast(saved_v[j+rowStart]); + accscalar_t grad_vj = g_this_row*(rnorm*grad_wj - rnorm3*saved_vj*result); + grad_v[j+rowStart] = scalar_cast(grad_vj); + } +} + +template + +__global__ void weight_norm_bwd_last_dim_kernel + (scalar_t* __restrict__ grad_v, + scalar_t* __restrict__ grad_g, + const scalar_t* __restrict__ grad_w, + const scalar_t* __restrict__ saved_v, + const scalar_t* __restrict__ saved_g, + const accscalar_t* __restrict__ saved_norms, + const int fast_dim_size, + const int slower_dims_size) +{ + const int fast_dim_location = threadIdx.x + blockIdx.x*blockDim.x; + + extern __shared__ char buf[]; + accscalar_t* s = (accscalar_t*)buf; + + accscalar_t thread_sum = 0.f; + + int slower_dims_location = threadIdx.y; + int currentIdx = fast_dim_location + fast_dim_size*slower_dims_location; + if(fast_dim_location < fast_dim_size) + while(slower_dims_location < slower_dims_size) + { + accscalar_t grad_wi = scalar_cast(grad_w[currentIdx]); + accscalar_t saved_vi = scalar_cast(saved_v[currentIdx]); + thread_sum += grad_wi*saved_vi; // AccumOp, could do Kahan here + currentIdx += blockDim.y*fast_dim_size; + slower_dims_location += blockDim.y; + } + + reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd()); + accscalar_t result = s[threadIdx.x]; + + // Broadcast load; could use shared memory instead. + accscalar_t rnorm = 1.f/saved_norms[fast_dim_location]; + accscalar_t rnorm3 = rnorm*rnorm*rnorm; + + // Write g gradients. + if(threadIdx.y == 0) + grad_g[fast_dim_location] = scalar_cast(result*rnorm); + + // Entire block pulls these values, could use shared memory instead. + accscalar_t g_this_col = scalar_cast(saved_g[fast_dim_location]); + + // Write v gradients. + slower_dims_location = threadIdx.y; + currentIdx = fast_dim_location + fast_dim_size*slower_dims_location; + if(fast_dim_location < fast_dim_size) + while(slower_dims_location < slower_dims_size) + { + accscalar_t grad_wj = scalar_cast(grad_w[currentIdx]); + accscalar_t saved_vj = scalar_cast(saved_v[currentIdx]); + accscalar_t grad_vj = g_this_col*(rnorm*grad_wj - rnorm3*saved_vj*result); + grad_v[currentIdx] = scalar_cast(grad_vj); + currentIdx += blockDim.y*fast_dim_size; + slower_dims_location += blockDim.y; + } +} + +} // anonymous namespace + +std::tuple weight_norm_cuda + (const Tensor & v, + const Tensor & g, + int64_t dim) +{ + auto w = at::empty_like(v); + + // weight_norm_fused does have a derivative defined in derivatives.yaml, therefore, VariableType.cpp + // sends the unpacked g.data() as the argument. In other words, we expect "g" is a bare Tensor here. + + // norms is only needed to stash for backward. + // g.type().scalarType() may be at::ScalarType::Double, Float, or Half. + // If Half, stash norms as float. + at::ScalarType AccType = g.type().scalarType() == at::ScalarType::Half ? 
+ at::ScalarType::Float : g.type().scalarType(); + // Will this create norms on the same device as g, regardless of what the thread's default + // current device is? I believe so, because Type::* functions are DeviceGuard()ed. + auto norms = g.type().toScalarType(AccType).tensor(g.sizes(), g.strides()); + + const int ndims = v.dim(); + + if(dim == 0) + { + // Find logical size of each flattened slowest-dim row + int rowSize = 1; + for(int i = ndims - 1; i > 0; i--) + rowSize *= v.size(i); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF + (v.type(), + "weight_norm_fwd_first_dim_kernel", + [&] + { + using accscalar_t = acc_type; + + weight_norm_fwd_first_dim_kernel + <<>> + (w.data(), + norms.data(), + v.data(), + g.data(), + rowSize); + }); + } + else if(dim == ndims - 1) + { + // Precompute slower_dims_size and fast_dim_size + int slower_dims_size = 1; + for(int i = 0; i < ndims - 1; i++) + slower_dims_size *= v.size(i); + + int fast_dim_size = v.size(ndims-1); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF + (v.type(), + "weight_norm_fwd_last_dim_kernel", + [&] + { + using accscalar_t = acc_type; + + weight_norm_fwd_last_dim_kernel + <<<(fast_dim_size+TILE_W-1)/TILE_W, + dim3(TILE_W,TILE_H), + (TILE_W*TILE_H + TILE_W)*sizeof(accscalar_t), + stream>>> + (w.data(), + norms.data(), + v.data(), + g.data(), + fast_dim_size, + slower_dims_size); + }); + } + + // The kernel execution is asynchronous, so this will only catch errors on the kernel launch, + // not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught + // until a later error check on a synchronizing CUDA call. Unfortunately, without manually + // synchronizing here, this is the best we can do. + THCudaCheck(cudaGetLastError()); + + return std::tuple{w, norms}; +} + +std::tuple weight_norm_cuda_backward + (const Tensor & grad_w, + const Tensor & saved_v, + const Tensor & saved_g, + const Tensor & saved_norms, + int64_t dim) +{ + // These checks should always succeed, because weight_norm_fused_backward should only + // ever be recorded in the autograd graph via weight_norm, which passes contiguous v and g. + AT_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); + AT_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); + AT_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous"); + AT_CHECK(dim == 0 || dim == saved_v.dim() - 1, "fused kernels can only be applied for first or last dim") + + auto grad_v = at::empty_like(saved_v); + auto grad_g = at::empty_like(saved_g); + + const int ndims = saved_v.dim(); + + if(dim == 0) + { + // Find logical size of each flattened slowest-dim row + int rowSize = 1; + for(int i = ndims - 1; i > 0; i--) + rowSize *= saved_v.size(i); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF + (saved_v.type(), + "weight_norm_bwd_first_dim_kernel", + [&] + { + using accscalar_t = acc_type; + + weight_norm_bwd_first_dim_kernel + <<>> + (grad_v.data(), + grad_g.data(), + grad_w.data(), + saved_v.data(), + saved_g.data(), + saved_norms.data(), + rowSize); + }); + } + else if(dim == ndims - 1) + { + // Precompute slower_dims_size and fast_dim_size because they involve dynamically indexing an array. 
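// Illustrative note (not part of this change): for dim == ndims - 1 the tensor
// is handled as a 2-D view [slower_dims_size, fast_dim_size], with one norm
// group (and one g entry) per index along the fast dimension. For example, a
// contiguous saved_v of shape {128, 64, 5} with dim == 2 flattens to
//
//   slower_dims_size = 128 * 64 = 8192   // product of all leading dims
//   fast_dim_size    = 5                 // size of `dim`; one norm per index here
//
// which is what the loop below computes before sizing the launch as
// ceil(fast_dim_size / TILE_W) blocks of TILE_W x TILE_H threads.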
+ int slower_dims_size = 1; + for(int i = 0; i < ndims - 1; i++) + slower_dims_size *= saved_v.size(i); + + int fast_dim_size = saved_v.size(ndims-1); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF + (saved_v.type(), + "weight_norm_bwd_last_dim_kernel", + [&] + { + using accscalar_t = acc_type; + + weight_norm_bwd_last_dim_kernel + <<<(fast_dim_size+TILE_W-1)/TILE_W, + dim3(TILE_W,TILE_H), + (TILE_W*TILE_H + TILE_W)*sizeof(accscalar_t), + stream>>> + (grad_v.data(), + grad_g.data(), + grad_w.data(), + saved_v.data(), + saved_g.data(), + saved_norms.data(), + fast_dim_size, + slower_dims_size); + }); + } + + // The kernel execution is asynchronous, so this will only catch errors on the kernel launch, + // not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught + // until a later error check on a synchronizing CUDA call. Unfortunately, without manually + // synchronizing here, this is the best we can do. + THCudaCheck(cudaGetLastError()); + + return std::tuple{grad_v, grad_g}; +} + +#undef BLOCK +#undef TILE_W +#undef TILE_H + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp index 6856c465e9e8ef..463d4ffea3cf04 100644 --- a/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp +++ b/aten/src/ATen/native/cudnn/AffineGridGenerator.cpp @@ -12,13 +12,13 @@ namespace at { namespace native { Tensor cudnn_affine_grid_generator_forward( const Tensor& theta, int64_t N, int64_t C, int64_t H, int64_t W) { - throw std::runtime_error("cudnn_affine_grid_generator_forward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_affine_grid_generator_forward: ATen not compiled with cuDNN support"); } Tensor cudnn_affine_grid_generator_backward( const Tensor& grad_theta, int64_t N, int64_t C, int64_t H, int64_t W) { - throw std::runtime_error("cudnn_affine_grid_generator_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_affine_grid_generator_backward: ATen not compiled with cuDNN support"); } }} diff --git a/aten/src/ATen/native/cudnn/BatchNorm.cpp b/aten/src/ATen/native/cudnn/BatchNorm.cpp index 7cd7466a285035..d54fe256b29152 100644 --- a/aten/src/ATen/native/cudnn/BatchNorm.cpp +++ b/aten/src/ATen/native/cudnn/BatchNorm.cpp @@ -13,7 +13,7 @@ std::tuple cudnn_batch_norm( const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& running_mean, const Tensor& running_var, bool training, double exponential_average_factor, double epsilon) { - throw std::runtime_error("cudnn_batch_norm: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_batch_norm: ATen not compiled with cuDNN support"); } std::tuple cudnn_batch_norm_backward( @@ -21,7 +21,7 @@ std::tuple cudnn_batch_norm_backward( const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_var, double epsilon) { - throw std::runtime_error("cudnn_batch_norm_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_batch_norm_backward: ATen not compiled with cuDNN support"); } }} // namespace at::native diff --git a/aten/src/ATen/native/cudnn/Conv.cpp b/aten/src/ATen/native/cudnn/Conv.cpp index a94916532a3f02..afbd7653aefa67 100644 --- a/aten/src/ATen/native/cudnn/Conv.cpp +++ b/aten/src/ATen/native/cudnn/Conv.cpp @@ -14,61 +14,61 @@ at::Tensor cudnn_convolution( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, IntList padding, IntList stride, 
IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_backward_input( IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_backward_input: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_backward_input: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_backward_weight( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_backward_weight: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_backward_weight: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_backward_bias( const at::Tensor& grad_output) { - throw std::runtime_error("cudnn_convolution_backward_bias: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_backward_bias: ATen not compiled with cuDNN support"); } std::tuple cudnn_convolution_backward( const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { - throw std::runtime_error("cudnn_convolution_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_backward: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_transpose( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_transpose: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_transpose: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_transpose_backward_input( const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_transpose_backward_weight( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("cudnn_convolution_transpose_backward_weight: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_transpose_backward_weight: ATen not compiled with cuDNN support"); } std::tuple cudnn_convolution_transpose_backward( const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { - throw std::runtime_error("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); } }} @@ 
-194,16 +194,12 @@ Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { // Used on pad, stride and dilation static void check_args(CheckedFrom c, IntList args, size_t expected_size, const char* arg_name) { - if (args.size() > expected_size){ - std::stringstream ss; - ss << "Too many " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); - } - else if (args.size() < expected_size){ - std::stringstream ss; - ss << "Not enough " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(args.size() <= expected_size, + "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + AT_CHECK(args.size() >= expected_size, + "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); if (num_negative_values > 0){ @@ -211,7 +207,7 @@ static void check_args(CheckedFrom c, IntList args, size_t expected_size, const ss << arg_name << " should be greater than zero but got ("; std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); ss << args.back() << ")" << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } } @@ -449,7 +445,7 @@ perf_t getBestAlgorithm(perf_t *perfResults, bool deterministic, int n_algo) { return perfResults[i]; } } - throw std::runtime_error("no deterministic convolution algorithms available in CuDNN"); + AT_ERROR("no deterministic convolution algorithms available in CuDNN"); } else { return perfResults[0]; } diff --git a/aten/src/ATen/native/cudnn/GridSampler.cpp b/aten/src/ATen/native/cudnn/GridSampler.cpp index 1ce92cf7e18d2a..e859344bcc3691 100644 --- a/aten/src/ATen/native/cudnn/GridSampler.cpp +++ b/aten/src/ATen/native/cudnn/GridSampler.cpp @@ -11,13 +11,13 @@ namespace at { namespace native { Tensor cudnn_grid_sampler_forward( const Tensor& input_t, const Tensor& grid_t) { - throw std::runtime_error("cudnn_grid_sampler_forward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_grid_sampler_forward: ATen not compiled with cuDNN support"); } std::tuple cudnn_grid_sampler_backward( const Tensor& input_t, const Tensor& grid_t, const Tensor& grad_output_t) { - throw std::runtime_error("cudnn_grid_sampler_backward: ATen not compiled with cuDNN support"); + AT_ERROR("cudnn_grid_sampler_backward: ATen not compiled with cuDNN support"); } }} diff --git a/aten/src/ATen/native/cudnn/LossCTC.cpp b/aten/src/ATen/native/cudnn/LossCTC.cpp index 966aa20e0a128d..98c0cb7918f02f 100644 --- a/aten/src/ATen/native/cudnn/LossCTC.cpp +++ b/aten/src/ATen/native/cudnn/LossCTC.cpp @@ -14,7 +14,7 @@ namespace at { namespace native { // See Note [ATen preprocessor philosophy] std::tuple _cudnn_ctc_loss(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK, bool deterministic) { - throw std::runtime_error("cudnn_ctc_loss: ATen not compiled with cuDNN >= 7 support"); + AT_ERROR("cudnn_ctc_loss: ATen not compiled with cuDNN >= 7 support"); } }} diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 
8fc896afe23a12..876590409c43c4 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -22,7 +22,7 @@ Tensor _cudnn_rnn_flatten_weight( int64_t fn_num_layers, bool batch_first, bool fn_bidirectional ) { - throw std::runtime_error("_cudnn_rnn_flatten_weight: ATen not compiled with cuDNN support"); + AT_ERROR("_cudnn_rnn_flatten_weight: ATen not compiled with cuDNN support"); } std::tuple _cudnn_rnn( @@ -34,7 +34,7 @@ std::tuple _cudnn_rnn( bool fn_train, bool fn_bidirectional, IntList fn_batch_sizes, const Tensor& fn_dropout_state ) { - throw std::runtime_error("_cudnn_rnn: ATen not compiled with cuDNN support"); + AT_ERROR("_cudnn_rnn: ATen not compiled with cuDNN support"); } std::tuple> _cudnn_rnn_backward( @@ -47,11 +47,11 @@ std::tuple> _cudnn_rnn_backward( const Tensor& dropout_state, const Tensor& reserve, std::array output_mask ) { - throw std::runtime_error("_cudnn_rnn_backward: ATen not compiled with cuDNN support"); + AT_ERROR("_cudnn_rnn_backward: ATen not compiled with cuDNN support"); } Tensor _cudnn_init_dropout_state(const Type& ty, double dropout, bool train, int64_t dropout_seed) { - throw std::runtime_error("_cudnn_init_dropout_state: ATen not compiled with cuDNN support"); + AT_ERROR("_cudnn_init_dropout_state: ATen not compiled with cuDNN support"); } }} // namespace at::native @@ -123,7 +123,7 @@ namespace { { std::ostringstream oss; oss << "unrecognized cuDNN RNN mode " << fn_mode; - throw std::runtime_error(oss.str()); + AT_ERROR(oss.str()); } } } @@ -131,7 +131,7 @@ namespace { void set_bidirectional(bool fn_bidirectional) { bidirectional = fn_bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; } - + void set_algo(cudnnRNNAlgo_t algo){ this->algo = algo; } @@ -570,7 +570,7 @@ namespace { if (prop->major == 7 && rnn.datatype == CUDNN_DATA_HALF && !tensors.is_input_packed()) { if (rnn.num_layers == 1 && rnn.hidden_size <= 1024 && tensors.input_size <=1024 && rnn.num_directions() == 1 && rnn.hidden_size % 128 == 0 && tensors.input_size % 128 == 0){ - //technically, batch size should be multiple of 8, but there are quite a few multiple-of-8 batchsizes that give bad perf, + //technically, batch size should be multiple of 8, but there are quite a few multiple-of-8 batchsizes that give bad perf, //weed them out if ((bsize % 16 == 0 && bsize != 80 && bsize !=112) || bsize == 8){ if ((tensors.seq_length >=40 && bsize <=128) || @@ -599,9 +599,8 @@ Tensor _cudnn_rnn_flatten_weight( bool fn_bidirectional ) { - if (weight_arr.size() == 0) { - throw std::runtime_error("_cudnn_rnn_flatten_weight_: cannot flatten empty weight list"); - } + AT_CHECK(weight_arr.size() > 0, + "_cudnn_rnn_flatten_weight_: cannot flatten empty weight list"); auto any_param = weight_arr[0]; @@ -671,9 +670,8 @@ std::tuple _cudnn_rnn( // TODO: Set device to input if (fn.rnn.mode != CUDNN_LSTM) { - if (cx.defined()) { - throw std::runtime_error("rnn: illegal defined cx for non-LSTM RNN"); - } + AT_CHECK(!cx.defined(), + "rnn: illegal defined cx for non-LSTM RNN"); } // TODO: can batch_first be a wrapper around this function? 
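// Illustrative summary (not part of the patch): the error-handling conversion
// applied throughout this file and the files above. A check previously spelled
//
//   if (!hx.is_contiguous()) {
//     throw std::runtime_error("rnn: hx is not contiguous");
//   }
//
// becomes a single call taking the condition plus message pieces, which are
// stringified and concatenated only if the condition fails:
//
//   AT_CHECK(hx.is_contiguous(), "rnn: hx is not contiguous");
//   AT_CHECK(!cx.defined() || cx.sizes().equals(hidden_size),
//            "Expected cell size ", IntList{hidden_size}, ", got ", cx.sizes());
//
// AT_ERROR(...) is the unconditional counterpart used for the
// "ATen not compiled with X support" stubs and other hard failures.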
@@ -685,12 +683,10 @@ std::tuple _cudnn_rnn( auto hidden_size = _hidden_size(fn.rnn, fn.tensors); auto output_size = _output_size(fn.rnn, fn.tensors); - if (!hx.is_contiguous()) { - throw std::runtime_error("rnn: hx is not contiguous"); - } - if (cx.defined() && !cx.is_contiguous()) { - throw std::runtime_error("rnn: cx is not contiguous"); - } + AT_CHECK(hx.is_contiguous(), + "rnn: hx is not contiguous"); + AT_CHECK(!cx.defined() || cx.is_contiguous(), + "rnn: cx is not contiguous"); auto x = input.contiguous(); auto output = input.type().tensor(output_size); @@ -723,11 +719,8 @@ std::tuple _cudnn_rnn( w_desc.set(weight_buf, 3); } - if (cx.defined() && !cx.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected cell size " << IntList{hidden_size} << ", got " << cx.sizes(); - throw std::runtime_error(oss.str()); - } + AT_CHECK(!cx.defined() || cx.sizes().equals(hidden_size), + "Expected cell size ", IntList{hidden_size}, ", got ", cx.sizes()); size_t workspace_size; auto x_descs_arr = descs.get_x_descs(); @@ -817,9 +810,8 @@ std::tuple _cudnn_rnn_backward_input( auto handle = getCudnnHandle(); if (fn.rnn.mode != CUDNN_LSTM) { - if (cx.defined()) { - throw std::runtime_error("rnn: illegal defined cx for non-LSTM RNN"); - } + AT_CHECK(!cx.defined(), + "rnn: illegal defined cx for non-LSTM RNN"); } auto is_input_packed = fn_batch_sizes.size() != 0; @@ -833,12 +825,10 @@ std::tuple _cudnn_rnn_backward_input( auto hidden_size = _hidden_size(fn.rnn, fn.tensors); auto output_size = _output_size(fn.rnn, fn.tensors); - if (!hx.is_contiguous()) { - throw std::runtime_error("rnn: hx is not contiguous"); - } - if (cx.defined() && !cx.is_contiguous()) { - throw std::runtime_error("rnn: cx is not contiguous"); - } + AT_CHECK(hx.is_contiguous(), + "rnn: hx is not contiguous"); + AT_CHECK(!cx.defined() || cx.is_contiguous(), + "rnn: cx is not contiguous"); auto x = input.contiguous(); auto dy = grad_output.contiguous(); @@ -851,42 +841,25 @@ std::tuple _cudnn_rnn_backward_input( AT_ASSERTM(cx.defined() || !output_mask[2], "illegally required grad of cx for non-LSTM RNN"); auto dcx = cx.defined() ? 
cx.type().tensor(hidden_size) : Tensor(); - if (!fn_train) { - throw std::runtime_error("cudnn RNN backward can only be called in training mode"); - } - if (!input.sizes().equals(input_size)) { - std::ostringstream oss; - oss << "Expected input size " << IntList{input_size} << ", got " << input.sizes(); - throw std::runtime_error(oss.str()); - } - if (!output.sizes().equals(output_size)) { - std::ostringstream oss; - oss << "Expected output size " << IntList{output_size} << ", got " << output.sizes(); - throw std::runtime_error(oss.str()); - } - if (hx.defined() && !hx.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected hidden size " << IntList{hidden_size} << ", got " << hx.sizes(); - throw std::runtime_error(oss.str()); - } - if (cx.defined() && !cx.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected cell size " << IntList{hidden_size} << ", got " << cx.sizes(); - throw std::runtime_error(oss.str()); - } - if (dhy.defined() && !dhy.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected d_hidden size " << IntList{hidden_size} << ", got " << dhy.sizes(); - throw std::runtime_error(oss.str()); - } - if (dcy.defined() && !dcy.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected d_cell size " << IntList{hidden_size} << ", got " << dcy.sizes(); - throw std::runtime_error(oss.str()); - } - if (!dhy.is_cuda() || !dy.is_cuda() || (dcy.defined() && !dcy.is_cuda())) { - throw std::runtime_error("Gradients aren't CUDA tensors"); - } + AT_CHECK(fn_train, + "cudnn RNN backward can only be called in training mode"); + + AT_CHECK(input.sizes().equals(input_size), + "Expected input size ", IntList{input_size}, ", got ", input.sizes()); + AT_CHECK(output.sizes().equals(output_size), + "Expected output size ", IntList{output_size}, ", got ", output.sizes()); + + AT_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), + "Expected hidden size ", IntList{hidden_size}, ", got ", hx.sizes()); + AT_CHECK(!cx.defined() || cx.sizes().equals(hidden_size), + "Expected cell size ", IntList{hidden_size}, ", got ", cx.sizes()); + AT_CHECK(!dhy.defined() || dhy.sizes().equals(hidden_size), + "Expected d_hidden size ", IntList{hidden_size}, ", got ", dhy.sizes()); + AT_CHECK(!dcy.defined() || dcy.sizes().equals(hidden_size), + "Expected d_cell size ", IntList{hidden_size}, ", got ", dcy.sizes()); + + AT_CHECK(dhy.is_cuda() && dy.is_cuda() && (!dcy.defined() || dcy.is_cuda()), + "Gradients aren't CUDA tensors"); cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors); fn.rnn.set_algo(algo); @@ -959,9 +932,8 @@ std::vector _cudnn_rnn_backward_weight( auto handle = getCudnnHandle(); if (fn.rnn.mode != CUDNN_LSTM) { - if (cx.defined()) { - throw std::runtime_error("rnn: illegal defined cx for non-LSTM RNN"); - } + AT_CHECK(!cx.defined(), + "rnn: illegal defined cx for non-LSTM RNN"); } auto is_input_packed = fn_batch_sizes.size() != 0; @@ -973,28 +945,21 @@ std::vector _cudnn_rnn_backward_weight( auto input_size = _input_size(fn.tensors); auto hidden_size = _hidden_size(fn.rnn, fn.tensors); - if (!fn_train) { - throw std::runtime_error("cudnn RNN backward can only be called in training mode"); - } - if (!input.sizes().equals(input_size)) { - std::ostringstream oss; - oss << "Expected input size " << IntList{input_size} << ", got " << input.sizes(); - throw std::runtime_error(oss.str()); - } - if (hx.defined() && !hx.sizes().equals(hidden_size)) { - std::ostringstream oss; - oss << "Expected hidden size " << IntList{hidden_size} << ", got 
" << hx.sizes(); - throw std::runtime_error(oss.str()); - } + AT_CHECK(fn_train, + "cudnn RNN backward can only be called in training mode"); + + AT_CHECK(input.sizes().equals(input_size), + "Expected input size ", IntList{input_size}, ", got ", input.sizes()); + AT_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), + "Expected hidden size ", IntList{hidden_size}, ", got ", hx.sizes()); + // TODO: the above were the only checks in rnn.py, but it doesn't seem // like these checks are enough - if (!hx.is_contiguous()) { - throw std::runtime_error("rnn: hx is not contiguous"); - } - if (cx.defined() && !cx.is_contiguous()) { - throw std::runtime_error("rnn: cx is not contiguous"); - } + AT_CHECK(hx.is_contiguous(), + "rnn: hx is not contiguous"); + AT_CHECK(!cx.defined() || cx.is_contiguous(), + "rnn: cx is not contiguous"); auto x = input.contiguous(); const auto& y = output; @@ -1059,9 +1024,9 @@ std::tuple> _cudnn_rnn_backward( std::array output_mask ) { - auto grad_output = grad_output_r.defined() ? grad_output_r : output.type().zeros_like(output); - auto grad_hy = grad_hy_r.defined() ? grad_hy_r : hx.type().zeros_like(hx); - auto grad_cy = cx.defined() ? (grad_cy_r.defined() ? grad_cy_r : cx.type().zeros_like(cx)) : grad_cy_r; + auto grad_output = grad_output_r.defined() ? grad_output_r : at::zeros_like(output); + auto grad_hy = grad_hy_r.defined() ? grad_hy_r : at::zeros_like(hx); + auto grad_cy = cx.defined() ? (grad_cy_r.defined() ? grad_cy_r : at::zeros_like(cx)) : grad_cy_r; Tensor dx, dhx, dcx; // NB: unconditionally compute this gradient, because it mutates reserve diff --git a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp index 997431b7a86170..c9d25780bd65d3 100644 --- a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp +++ b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp @@ -14,7 +14,7 @@ std::tuple miopen_batch_norm( const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& running_mean, const Tensor& running_var, bool training, double exponential_average_factor, double epsilon) { - throw std::runtime_error("miopen_batch_norm: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_batch_norm: ATen not compiled with MIOpen support"); } std::tuple miopen_batch_norm_backward( @@ -22,7 +22,7 @@ std::tuple miopen_batch_norm_backward( const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_var, double epsilon) { - throw std::runtime_error("miopen_batch_norm_backward: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_batch_norm_backward: ATen not compiled with MIOpen support"); } }} // namespace at::native diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 1ae36edd5c7b76..9aeaad73558617 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -13,61 +13,61 @@ at::Tensor miopen_convolution( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_backward_input( IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool 
benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_backward_input: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_backward_input: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_backward_weight( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_backward_weight: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_backward_weight: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_backward_bias( const at::Tensor& grad_output) { - throw std::runtime_error("miopen_convolution_backward_bias: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_backward_bias: ATen not compiled with MIOpen support"); } std::tuple miopen_convolution_backward( const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { - throw std::runtime_error("miopen_convolution_backward: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_backward: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_transpose( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_transpose: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_transpose: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_transpose_backward_input( const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_transpose_backward: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_transpose_backward: ATen not compiled with MIOpen support"); } at::Tensor miopen_convolution_transpose_backward_weight( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) { - throw std::runtime_error("miopen_convolution_transpose_backward_weight: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_transpose_backward_weight: ATen not compiled with MIOpen support"); } std::tuple miopen_convolution_transpose_backward( const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) { - throw std::runtime_error("miopen_convolution_transpose_backward: ATen not compiled with MIOpen support"); + AT_ERROR("miopen_convolution_transpose_backward: ATen not compiled with MIOpen support"); } }} @@ -180,16 +180,12 @@ Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { // Used on pad, stride and dilation static void check_args(CheckedFrom c, IntList args, size_t expected_size, const char* arg_name) { - if (args.size() > expected_size){ - std::stringstream ss; - ss << "Too many " << arg_name << " values (" << args.size() << ") supplied, expecting " << 
expected_size << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); - } - else if (args.size() < expected_size){ - std::stringstream ss; - ss << "Not enough " << arg_name << " values (" << args.size() << ") supplied, expecting " << expected_size << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(args.size() <= expected_size, + "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + AT_CHECK(args.size() >= expected_size, + "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); if (num_negative_values > 0){ @@ -197,7 +193,7 @@ static void check_args(CheckedFrom c, IntList args, size_t expected_size, const ss << arg_name << " should be greater than zero but got ("; std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); ss << args.back() << ")" << " (while checking arguments for " << c << ")"; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } } diff --git a/aten/src/ATen/native/mkl/LinearAlgebra.cpp b/aten/src/ATen/native/mkl/LinearAlgebra.cpp new file mode 100644 index 00000000000000..062dd56d2ca300 --- /dev/null +++ b/aten/src/ATen/native/mkl/LinearAlgebra.cpp @@ -0,0 +1,95 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/Config.h" + +#if !AT_MKL_ENABLED() + +namespace at { namespace native { + +Tensor& _baddbmm_mkl_(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + AT_ERROR("bmm: ATen not compiled with MKL support"); +} + +}} + +#else // AT_MKL_ENABLED + +#include "ATen/ATen.h" +#include "ATen/Config.h" +#include "ATen/Dispatch.h" +#include "ATen/Utils.h" +#include "ATen/NativeFunctions.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace at { namespace native { + +static inline void gemm_batched(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, + const int batch_size, const int M, const int N, const int K, const float alpha, + const float** A, const float** B, const float beta, float** C) { + const int lda = (trans_A == CblasNoTrans) ? K : M; + const int ldb = (trans_B == CblasNoTrans) ? N : K; + const int ldc = N; + + cblas_sgemm_batch(CblasRowMajor, &trans_A, &trans_B, &M, &N, &K, &alpha, + A, &lda, B, &ldb, &beta, C, &ldc, 1, &batch_size); +} + +static inline void gemm_batched(const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, + const int batch_size, const int M, const int N, const int K, const double alpha, + const double** A, const double** B, const double beta, double** C) { + const int lda = (trans_A == CblasNoTrans) ? K : M; + const int ldb = (trans_B == CblasNoTrans) ? N : K; + const int ldc = N; + + cblas_dgemm_batch(CblasRowMajor, &trans_A, &trans_B, &M, &N, &K, &alpha, + A, &lda, B, &ldb, &beta, C, &ldc, 1, &batch_size); +} + +template +static inline void baddbmm_mkl_template(const Tensor& res, const Tensor& mat1, const Tensor& mat2, Scalar beta_, Scalar alpha_) { + auto is_transposed = [&](const Tensor& t) { + return t.stride(0) == 1 && t.stride(1) == t.size(0); + }; + const CBLAS_TRANSPOSE trans_A = is_transposed(mat1[0]) ? CblasTrans : CblasNoTrans; + const CBLAS_TRANSPOSE trans_B = is_transposed(mat2[0]) ? 
CblasTrans : CblasNoTrans; + + const int batch_size = mat1.size(0); + const int M = mat1.size(1); + const int N = mat2.size(2); + const int K = mat1.size(2); + scalar_t alpha = alpha_.to(); + scalar_t beta = beta_.to(); + + std::vector A(batch_size); + std::vector B(batch_size); + std::vector C(batch_size); + for (int64_t batch = 0; batch < batch_size; batch++) { + A[batch] = mat1[batch].data(); + B[batch] = mat2[batch].data(); + C[batch] = res[batch].data(); + } + + gemm_batched(trans_A, trans_B, batch_size, M, N, K, alpha, A.data(), B.data(), beta, C.data()); +} + +Tensor& _baddbmm_mkl_(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { + // checks are done in native/LinearAlgebra.cpp + AT_DISPATCH_FLOATING_TYPES(self.type(), "baddbmm__mkl", [&] { + baddbmm_mkl_template(self, batch1, batch2, beta, alpha); + }); + + return self; +} + +}} // namespace at::native + +#endif diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index c3451824c05113..2c81d69d3b8435 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -12,7 +12,7 @@ Tensor _fft_mkl(const Tensor& input, int64_t signal_ndim, bool inverse, IntList checked_signal_sizes, bool normalized, bool onesided, IntList output_sizes) { - throw std::runtime_error("fft: ATen not compiled with MKL support"); + AT_ERROR("fft: ATen not compiled with MKL support"); } }} @@ -191,12 +191,8 @@ Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, osize = output_sizes[i]; istride = complex_input ? input.stride(i) >> 1 : input.stride(i); ostride = onumel; - if (isize > MKL_LONG_MAX || osize > MKL_LONG_MAX || ostride > MKL_LONG_MAX) { - std::ostringstream ss; - ss << "MKL FFT: input signal numel exceeds allowed range [1 ~ " - << MKL_LONG_MAX << "]"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(isize <= MKL_LONG_MAX && osize <= MKL_LONG_MAX && ostride <= MKL_LONG_MAX, + "MKL FFT: input signal numel exceeds allowed range [1 ~ ", MKL_LONG_MAX, "]"); if (!need_contiguous && istride > MKL_LONG_MAX) { // If we didn't plan to contiguous-fy but the `istride` exceeds bound, // check if we can stride (equal to `inumel`) get back within bound if @@ -205,12 +201,8 @@ Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, // fine as `inumel` is non-decreasing. 
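The MKL path above submits all batches in a single grouped `cblas_?gemm_batch` call (one group of `batch_size` same-shape multiplications). For reference, this is a dependency-free sketch of the computation `_baddbmm_mkl_` performs on contiguous row-major data, namely `C[b] = beta * C[b] + alpha * A[b] * B[b]` for every batch `b`; the flat-buffer layout and naive loops are illustrative assumptions, not the MKL calling convention:

```cpp
#include <cstddef>
#include <vector>

// res: batch x M x N, mat1: batch x M x K, mat2: batch x K x N,
// all stored contiguously in row-major order.
void baddbmm_reference(std::vector<float>& res,
                       const std::vector<float>& mat1,
                       const std::vector<float>& mat2,
                       std::size_t batch, std::size_t M, std::size_t N,
                       std::size_t K, float beta, float alpha) {
  for (std::size_t b = 0; b < batch; ++b) {
    const float* A = mat1.data() + b * M * K;
    const float* B = mat2.data() + b * K * N;
    float* C = res.data() + b * M * N;
    for (std::size_t i = 0; i < M; ++i) {
      for (std::size_t j = 0; j < N; ++j) {
        float acc = 0.f;
        for (std::size_t k = 0; k < K; ++k)
          acc += A[i * K + k] * B[k * N + j];
        C[i * N + j] = beta * C[i * N + j] + alpha * acc;
      }
    }
  }
}
```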
need_contiguous = true; } - if (need_contiguous && inumel > MKL_LONG_MAX) { - std::ostringstream ss; - ss << "MKL FFT: input signal numel exceeds allowed range [1 ~ " - << MKL_LONG_MAX << "]"; - throw std::runtime_error(ss.str()); - } + AT_CHECK(!need_contiguous || inumel <= MKL_LONG_MAX, + "MKL FFT: input signal numel exceeds allowed range [1 ~ ", MKL_LONG_MAX, "]"); inumel *= isize; onumel *= osize; } @@ -227,7 +219,7 @@ Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, std::ostringstream ss; ss << "MKL FFT doesn't support tensor of type: " << at::toString(input.type().scalarType()); - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } // signal type DFTI_CONFIG_VALUE signal_type; diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp index 00f4e8f95b92d4..ddbd6977645e74 100644 --- a/aten/src/ATen/native/mkldnn/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/Conv.cpp @@ -9,25 +9,25 @@ namespace at { namespace native { at::Tensor mkldnn_convolution( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias, IntList padding, IntList stride, IntList dilation, int64_t groups) { - throw std::runtime_error("mkldnn_convolution_forward: ATen not compiled with MKLDNN support"); + AT_ERROR("mkldnn_convolution_forward: ATen not compiled with MKLDNN support"); } at::Tensor mkldnn_convolution_backward_input( IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) { - throw std::runtime_error("mkldnn_convolution_backward_input: ATen not compiled with MKLDNN support"); + AT_ERROR("mkldnn_convolution_backward_input: ATen not compiled with MKLDNN support"); } std::tuple mkldnn_convolution_backward_weights( IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) { - throw std::runtime_error("mkldnn_convolution_backward_weights: ATen not compiled with MKLDNN support"); + AT_ERROR("mkldnn_convolution_backward_weights: ATen not compiled with MKLDNN support"); } std::tuple mkldnn_convolution_backward( const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, IntList padding, IntList stride, IntList dilation, int64_t groups, std::array output_mask) { - throw std::runtime_error("mkldnn_convolution_backward: ATen not compiled with MKLDNN support"); + AT_ERROR("mkldnn_convolution_backward: ATen not compiled with MKLDNN support"); } }} diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 2b71982be8cf6a..5c99d7c97e9b3e 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -245,6 +245,27 @@ CPU: _atan_out_cpu CUDA: _atan_out_cuda +- func: baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function, method + dispatch: + CPU: baddbmm_cpu + CUDA: baddbmm_cuda + +- func: baddbmm_(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: method + dispatch: + CPU: baddbmm__cpu + CUDA: baddbmm__cuda + +- func: _baddbmm_mkl_(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + +- func: baddbmm_out(Tensor result, Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function + dispatch: + CPU: baddbmm_out_cpu + CUDA: 
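These `native_functions.yaml` entries register `baddbmm` (function, method, in-place, and `_out` variants with CPU/CUDA dispatch) and leave `_baddbmm_mkl_` dispatch-free so the CPU implementation can forward to it when MKL is available; the `bmm` entries just below give it the plain batched-matmul sibling. A minimal caller-side sketch, assuming an ATen build at roughly this revision (shapes chosen arbitrarily):

```cpp
#include <ATen/ATen.h>

int main() {
  at::Tensor a = at::rand({8, 3, 4});   // batch x M x K
  at::Tensor b = at::rand({8, 4, 5});   // batch x K x N
  at::Tensor c = at::zeros({8, 3, 5});  // batch x M x N

  at::Tensor prod = at::bmm(a, b);                            // plain batched matmul
  at::Tensor out  = at::baddbmm(c, a, b, /*beta=*/1, /*alpha=*/2);
  c.baddbmm_(a, b);                                           // in-place method variant

  // With c initialized to zeros, baddbmm reduces to alpha * (a @ b).
  return out.allclose(2 * prod) ? 0 : 1;
}
```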
baddbmm_out_cuda + - func: bartlett_window(int64_t window_length, TensorOptions options={}) -> Tensor - func: bartlett_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor @@ -281,6 +302,18 @@ - func: blackman_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor +- func: bmm(Tensor self, Tensor mat2) -> Tensor + variants: function, method + dispatch: + CPU: bmm_cpu + CUDA: bmm_cuda + +- func: bmm_out(Tensor result, Tensor self, Tensor mat2) -> Tensor + variants: function + dispatch: + CPU: bmm_out_cpu + CUDA: bmm_out_cuda + - func: broadcast_tensors(TensorList tensors) -> TensorList - func: cat(TensorList tensors, int64_t dim=0) -> Tensor @@ -374,7 +407,7 @@ - func: conv3d(Tensor input, Tensor weight, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] dilation=1, int64_t groups=1) -> Tensor -- func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int64_t pad) -> Tensor +- func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int64_t pad=0) -> Tensor - func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int64_t pad) -> (Tensor, Tensor, Tensor) @@ -840,6 +873,9 @@ - func: index_put_(Tensor self, TensorList indices, Tensor values) -> Tensor variants: function, method +- func: instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, double momentum, double eps, bool cudnn_enabled) -> Tensor + variants: function + - func: inverse(Tensor self) -> Tensor variants: function, method @@ -1177,6 +1213,8 @@ - func: permute(Tensor self, IntList dims) -> Tensor variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. +- func: pixel_shuffle(Tensor self, int64_t upscale_factor) -> Tensor + - func: pin_memory(Tensor self) -> Tensor variants: function, method @@ -1713,6 +1751,27 @@ CPU: _s_where_cpu CUDA: _s_where_cuda +- func: norm_except_dim(Tensor v, int64_t pow=2, int64_t dim=0) -> Tensor + variants: function + +# VariableType::_weight_norm does not want to be given a gap in the autograd graph, +# so we don't define "dispatch" variants for it. 
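`pixel_shuffle(self, upscale_factor)`, registered above, follows the standard sub-pixel convolution semantics: a `(C*r*r, H, W)` input is rearranged into `(C, H*r, W*r)` by moving the `r*r` channel factor into `r x r` spatial blocks. A standalone index-level sketch of that mapping on a flat row-major buffer (an illustration of the semantics only, not the ATen kernel, and it ignores leading batch dimensions):

```cpp
#include <cstddef>
#include <vector>

// in: (C*r*r) x H x W, row-major; out: C x (H*r) x (W*r), row-major.
std::vector<float> pixel_shuffle_ref(const std::vector<float>& in,
                                     std::size_t C, std::size_t H,
                                     std::size_t W, std::size_t r) {
  std::vector<float> out(C * H * r * W * r);
  for (std::size_t c = 0; c < C; ++c)
    for (std::size_t ry = 0; ry < r; ++ry)
      for (std::size_t rx = 0; rx < r; ++rx)
        for (std::size_t h = 0; h < H; ++h)
          for (std::size_t w = 0; w < W; ++w) {
            std::size_t in_c = (c * r + ry) * r + rx;
            std::size_t src  = (in_c * H + h) * W + w;
            std::size_t dst  = (c * H * r + h * r + ry) * (W * r) + (w * r + rx);
            out[dst] = in[src];
          }
  return out;
}
```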
+- func: _weight_norm(Tensor v, Tensor g, int64_t dim=0) -> Tensor + variants: function + +- func: _weight_norm_cuda_interface(Tensor v, Tensor g, int64_t dim=0) -> (Tensor, Tensor) + variants: function + dispatch: + CUDA: weight_norm_cuda + +- func: _weight_norm_cuda_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int64_t dim) -> (Tensor, Tensor) + variants: function + dispatch: + CUDA: weight_norm_cuda_backward + +- func: _weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int64_t dim) -> (Tensor, Tensor) + variants: function + - func: zeros(IntList size, TensorOptions options={}) -> Tensor - func: zeros_out(Tensor result, IntList size) -> Tensor @@ -2030,6 +2089,22 @@ variants: function, method device_guard: False +- func: to(Tensor self, Device device, ScalarType dtype, bool non_blocking=false) -> Tensor + variants: method + device_guard: False + +- func: to(Tensor self, ScalarType dtype, bool non_blocking=false) -> Tensor + variants: method + device_guard: False + +- func: to(Tensor self, Device device, bool non_blocking=false) -> Tensor + variants: method + device_guard: False + +- func: to(Tensor self, Tensor other, bool non_blocking=false) -> Tensor + variants: method + device_guard: False + - func: meshgrid(TensorList tensors) -> TensorList # This has a method dispatch to work around circular include problems diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index c96e577ba9d47a..25fd4fc5df4326 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -68,7 +68,7 @@ SparseTensor new_sparse(const SparseType& dtype) { } else { type_id = SparseCPUTensorId(); } - return SparseTensor(c10::make_intrusive(type_id, dtype.scalarType()).release(), /* retain */ false); + return SparseTensor(c10::make_intrusive(type_id, scalarTypeToTypeMeta(dtype.scalarType()))); } /*** Helper methods ***/ diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 2e37ad41a3b96e..afd8001734a9a8 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -784,7 +784,7 @@ Tensor& _sspaddmm_out_only_sparse(Tensor& result, const Tensor& self, // sparse, dense -> sparse Tensor smm(const Tensor& self, const Tensor& mat2) { auto result = self.type().tensor(); - self.type().sspaddmm_out(result, result, self, mat2, 0.0, 1.0); + at::sspaddmm_out(result, result, self, mat2, 0.0, 1.0); return result; } @@ -792,7 +792,7 @@ Tensor smm(const Tensor& self, const Tensor& mat2) { Tensor sspaddmm(const Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) { auto result = self.type().tensor(); - self.type().sspaddmm_out(result, self, mat1, mat2, beta, alpha); + at::sspaddmm_out(result, self, mat1, mat2, beta, alpha); return result; } diff --git a/aten/src/ATen/preprocess_declarations.py b/aten/src/ATen/preprocess_declarations.py index 173ac439487d26..98b22c7f8e1d6a 100644 --- a/aten/src/ATen/preprocess_declarations.py +++ b/aten/src/ATen/preprocess_declarations.py @@ -217,8 +217,20 @@ def signature(option, i=None, value=None): (raw_args - filtered_args)] +def is_extended_method(option): + if 'method' in option['variants']: + return False + elif option.get('deprecated', False): + return False + elif not option['variants']: + return False + else: + return True + + def run(declarations): 
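`norm_except_dim` and the `_weight_norm` entries above (alongside the CUDA and differentiable-backward variants) implement weight normalization: `w = g * v / ||v||`, with the norm taken over every dimension of `v` except `dim`. A dependency-free sketch of the math for the common `dim = 0` case on a 2-D weight; the layout, names, and the small epsilon are illustrative choices only:

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// v and w are rows x cols, row-major; g has one entry per row (dim = 0 slice).
void weight_norm_dim0(const std::vector<float>& v, const std::vector<float>& g,
                      std::vector<float>& w, std::size_t rows, std::size_t cols) {
  for (std::size_t i = 0; i < rows; ++i) {
    float norm_sq = 0.f;  // squared norm of row i over all dims except dim 0
    for (std::size_t j = 0; j < cols; ++j)
      norm_sq += v[i * cols + j] * v[i * cols + j];
    float inv_norm = 1.f / std::sqrt(norm_sq + 1e-12f);  // illustrative zero guard
    for (std::size_t j = 0; j < cols; ++j)
      w[i * cols + j] = g[i] * v[i * cols + j] * inv_norm;
  }
}
```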
declarations = [d for d in declarations if not exclude(d)] + non_extended_methods = set() for declaration in declarations: common_with_cwrap.set_declaration_defaults(declaration) declaration['options'] = [deepcopy(o) for o in declaration['options']] @@ -237,6 +249,20 @@ def run(declarations): sanitize_return(option) process_types_and_backends(option) add_variants(option) + if not is_extended_method(option): + non_extended_methods.add(option['api_name']) declaration['options'] = handle_outputs_taken_as_arguments( declaration['options']) + + # We (very unfortunately) have overloaded virtual methods. Because + # of C++'s rules, we cannot move one overload without doing some + # extra work to make sure that overload in a superclass and an + # overload in a subclass resolve together. I've chosen to resolve + # this problem simply by moving ALL overloads of a method which + # occurs in Tensor to Type. This is why we have to first compute + # which methods *names* go on type, and then move ALL overloads + # of this name to Type. + for declaration in declarations: + for option in declaration['options']: + option['extended_method'] = option['api_name'] not in non_extended_methods return declarations diff --git a/aten/src/ATen/templates/Functions.h b/aten/src/ATen/templates/Functions.h index b4a2e05e759ea3..8bbc17af5da291 100644 --- a/aten/src/ATen/templates/Functions.h +++ b/aten/src/ATen/templates/Functions.h @@ -4,6 +4,7 @@ #include "ATen/core/Scalar.h" #include "ATen/Type.h" +#include "ATen/TypeExtendedInterface.h" #include "ATen/Tensor.h" #include "ATen/core/Storage.h" #include "ATen/core/Generator.h" @@ -20,14 +21,19 @@ using native::tensor; ${function_declarations} -static inline Type & infer_type(const Tensor & t) { +namespace detail { + +static inline TypeExtendedInterface & infer_type(const Tensor & t) { AT_CHECK(t.defined(), "undefined Tensor"); - return t.type(); + return getType(t); } -static inline Type & infer_type(const TensorList & tl) { +static inline TypeExtendedInterface & infer_type(const TensorList & tl) { AT_CHECK(tl.size() > 0, "expected a non-empty list of Tensors"); - return tl[0].type(); + return getType(tl[0]); } + +} // namespace detail + // function definitions are all static inline because // they are one-line statically dispatched functions that // invoke the actual dynamic dispatch on the correct argument diff --git a/aten/src/ATen/templates/NativeFunctions.h b/aten/src/ATen/templates/NativeFunctions.h index 82a2f00ff77bc7..c6355127734b1b 100644 --- a/aten/src/ATen/templates/NativeFunctions.h +++ b/aten/src/ATen/templates/NativeFunctions.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include diff --git a/aten/src/ATen/templates/SparseTypeDerived.cpp b/aten/src/ATen/templates/SparseTypeDerived.cpp index 064da2bc2186e0..61035f2c3d38f7 100644 --- a/aten/src/ATen/templates/SparseTypeDerived.cpp +++ b/aten/src/ATen/templates/SparseTypeDerived.cpp @@ -11,7 +11,7 @@ #include "ATen/Allocator.h" #include "ATen/DeviceGuard.h" #include "ATen/NativeFunctions.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include "ATen/Utils.h" #include "ATen/WrapDimUtils.h" #include "ATen/core/Half.h" @@ -32,6 +32,9 @@ namespace at { ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } +caffe2::TypeMeta ${Type}::typeMeta() const { + return caffe2::TypeMeta::Make<${ScalarType}>(); +} Backend ${Type}::backend() const { return Backend::${Backend}; } diff --git a/aten/src/ATen/templates/Tensor.h 
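The long comment added to `run()` above is about C++ name hiding: declaring one overload of a virtual method in a derived class hides the base class's other overloads of that name (unless a `using` declaration re-exposes them), so overloads of a method that exists on both `Tensor` and `Type` have to move as a complete set. A minimal illustration of the rule, with invented names:

```cpp
struct Base {
  virtual ~Base() = default;
  virtual int sum(int a) const { return a; }
  virtual int sum(int a, int b) const { return a + b; }
};

struct Derived : Base {
  // Declaring only this overload hides Base::sum(int, int) for lookups
  // through Derived; `using Base::sum;` would bring it back.
  int sum(int a) const override { return a + 1; }
};

int main() {
  Derived d;
  int one_arg = d.sum(1);        // OK: calls Derived::sum(int), yields 2
  // int two_arg = d.sum(1, 2);  // error: Base::sum(int, int) is hidden
  Base& b = d;
  int via_base = b.sum(1, 2);    // still fine through a Base reference, yields 3
  return (one_arg == 2 && via_base == 3) ? 0 : 1;
}
```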
b/aten/src/ATen/templates/Tensor.h index 77c18466404c83..85e7c84961d6ee 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -1,7 +1,5 @@ #pragma once -// ${generated_comment} - #include "ATen/core/Device.h" #include "ATen/core/Layout.h" #include "ATen/core/Scalar.h" @@ -9,9 +7,9 @@ #include "ATen/core/SparseTensorRef.h" #include "ATen/core/Storage.h" #include "ATen/core/TensorAccessor.h" -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" #include "ATen/core/optional.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include "ATen/core/Error.h" namespace at { @@ -41,20 +39,12 @@ namespace at { // special care must be taken to handle this. struct AT_API Tensor { Tensor(){}; - Tensor(TensorImpl* tensor_impl, bool retain) - : tensor_impl_(c10::intrusive_ptr::reclaim( - tensor_impl)) { - if (tensor_impl == nullptr) { + Tensor(c10::intrusive_ptr tensor_impl) + : tensor_impl_(std::move(tensor_impl)) { + if (tensor_impl_.get() == nullptr) { throw std::runtime_error("TensorBaseImpl with nullptr not supported"); } - if (retain && tensor_impl != UndefinedTensor::singleton()) { - c10::raw::intrusive_ptr::incref(tensor_impl); - } } - Tensor(const c10::intrusive_ptr& ptr) - : tensor_impl_(ptr) {} - Tensor(c10::intrusive_ptr&& ptr) - : tensor_impl_(std::move(ptr)) {} Tensor(const Tensor&) = default; Tensor(Tensor&&) = default; @@ -69,7 +59,7 @@ struct AT_API Tensor { TensorImpl * unsafeReleaseTensorImpl() { return tensor_impl_.release(); } - const c10::intrusive_ptr& getIntrusivePtr() const { + const c10::intrusive_ptr& getIntrusivePtr() const { return tensor_impl_; } @@ -152,7 +142,7 @@ struct AT_API Tensor { return tensor_impl_->type_id(); } ScalarType scalar_type() const { - return tensor_impl_->scalar_type(); + return dataTypeToScalarType(tensor_impl_->dtype().id()); } const Storage& storage() const { return tensor_impl_->storage(); @@ -162,12 +152,6 @@ struct AT_API Tensor { Tensor toType(ScalarType t) const; Tensor toBackend(Backend b) const; - /// New-style `to()` methods. - /// NB: These methods are defined in TensorOptions.h. - Tensor to(Device device, ScalarType dtype, bool non_blocking = false) const; - Tensor to(ScalarType dtype, bool non_blocking = false) const; - Tensor to(Device device, bool non_blocking = false) const; - /// Returns true if the `Tensor` is actually a `torch::autograd::Variable`. /// Defined in Type.h because of include order issues. bool is_variable() const noexcept; @@ -202,6 +186,8 @@ struct AT_API Tensor { AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_C_TYPE) #undef TO_C_TYPE + // Return a `TensorAccessor` for CPU `Tensor`s. You have to specify scalar type and + // dimension. template TensorAccessor accessor() const& { static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); @@ -211,6 +197,20 @@ struct AT_API Tensor { template TensorAccessor accessor() && = delete; + // Return a `PackedTensorAccessor` for CUDA `Tensor`s. You have to specify scalar type and + // dimension. You can optionally specify RestrictPtrTraits as a template parameter to + // cast the data pointer to a __restrict__ pointer. + // In order to use this, your CUDA kernel has to take a corresponding PackedTensorAccessor + // as an argument. 
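The comments introduced here document the two accessor flavours: `accessor<T, N>()` for CPU tensors, and `packed_accessor<T, N>()` (defined just below, and exercised by the new `cuda_packedtensoraccessor_test.cu` later in this diff) for passing a tensor into a CUDA kernel by value. A short host-side sketch of the CPU accessor, assuming an ATen build:

```cpp
#include <ATen/ATen.h>
#include <cstdio>

int main() {
  at::Tensor t = at::rand({3, 4});     // CPU float tensor
  auto acc = t.accessor<float, 2>();   // checked against dtype and dim() at runtime

  float total = 0.f;
  for (int64_t i = 0; i < acc.size(0); ++i)
    for (int64_t j = 0; j < acc.size(1); ++j)
      total += acc[i][j];              // strided indexing, no per-element dispatch

  std::printf("sum = %f\n", total);
  return 0;
}
```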
+ template class PtrTraits = DefaultPtrTraits> + PackedTensorAccessor packed_accessor() const& { + static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data()"); + AT_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim()); + return PackedTensorAccessor(static_cast::PtrType>(data()),sizes().data(),strides().data()); + } + template class PtrTraits = DefaultPtrTraits> + PackedTensorAccessor packed_accessor() && = delete; + Tensor operator-() const; Tensor& operator+=(const Tensor & other); Tensor& operator+=(Scalar other); @@ -267,7 +267,7 @@ struct AT_API Tensor { friend struct WeakTensor; protected: - c10::intrusive_ptr tensor_impl_; + c10::intrusive_ptr tensor_impl_; }; struct AT_API WeakTensor { @@ -295,6 +295,8 @@ struct AT_API WeakTensor { } private: - c10::weak_intrusive_ptr weak_tensor_impl_; + c10::weak_intrusive_ptr weak_tensor_impl_; }; } // namespace at + +#include "ATen/core/TensorMethods.h" diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index 07e5f2b634372c..8283bea01f6bed 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -1,13 +1,10 @@ #pragma once -// ${generated_comment} - -#include "ATen/Tensor.h" +#include "ATen/core/Tensor.h" #include "ATen/core/Scalar.h" #include "ATen/core/SparseTensorRef.h" -#include "ATen/Type.h" +#include "ATen/core/Type.h" #include "ATen/core/TensorOptions.h" -#include "ATen/DeviceGuard.h" namespace at { @@ -44,45 +41,6 @@ inline TensorOptions Tensor::options() const { .is_variable(is_variable()); } -namespace detail { -inline Tensor to( - const Tensor& tensor, - const TensorOptions& options, - bool non_blocking) { - // Don't copy if the options match. - if (tensor.options() == options) { - return tensor; - } - AT_CHECK(tensor.is_variable() == options.is_variable(), - "cannot change is_variable, from: ", tensor.is_variable(), - " to: ", options.is_variable()); - DeviceGuard guard(options.device()); - return tensor.type().toBackend(options.backend()).toScalarType(options.dtype()).copy(tensor, non_blocking); -} -} // namespace detail - -inline Tensor Tensor::to(Device device, ScalarType dtype, bool non_blocking) - const { - if (this->device() == device && this->dtype() == dtype) { - return *this; - } - return detail::to(*this, options().device(device).dtype(dtype), non_blocking); -} - -inline Tensor Tensor::to(ScalarType dtype, bool non_blocking) const { - if (this->dtype() == dtype) { - return *this; - } - return detail::to(*this, options().dtype(dtype), non_blocking); -} - -inline Tensor Tensor::to(Device device, bool non_blocking) const { - if (this->device() == device) { - return *this; - } - return detail::to(*this, options().device(device), non_blocking); -} - inline void Tensor::backward( at::optional gradient, bool keep_graph, @@ -97,6 +55,22 @@ inline void Tensor::set_data(Tensor new_data) { // all static inline to allow for inlining of the non-dynamic part of dispatch ${tensor_method_definitions} +inline bool Tensor::is_variable() const noexcept { + return type().is_variable(); +} + +inline ScalarType Tensor::dtype() const noexcept { + return type().scalarType(); +} + +inline Layout Tensor::layout() const noexcept { + return type().layout(); +} + +inline Device Tensor::device() const { + return Device(type().device_type(), type().is_cuda() ? 
get_device() : -1); +} + #define DEFINE_CAST(T, name, _) \ template <> \ inline T* Tensor::data() const { \ diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index 3a7080ea201e35..0e00a5d3499fcd 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -1,7 +1,5 @@ #pragma once -// ${generated_comment} - #include "ATen/core/ATenGeneral.h" #include "ATen/core/Allocator.h" #include "ATen/core/Deprecated.h" @@ -10,7 +8,6 @@ #include "ATen/core/Scalar.h" #include "ATen/core/ScalarType.h" #include "ATen/core/SparseTensorRef.h" -#include "ATen/Tensor.h" #include "ATen/core/ArrayRef.h" #include "ATen/core/Half.h" #include "ATen/core/TensorTypeIdRegistration.h" @@ -36,6 +33,7 @@ class Context; struct Allocator; struct Generator; struct Storage; +struct Tensor; static inline void noop_deleter(void*) {} @@ -55,6 +53,7 @@ struct AT_API Type { virtual ~Type() {} virtual ScalarType scalarType() const = 0; + virtual caffe2::TypeMeta typeMeta() const = 0; virtual Backend backend() const = 0; Layout layout() const noexcept { return layout_from_backend(backend()); } virtual bool is_cuda() const = 0; @@ -99,7 +98,7 @@ struct AT_API Type { return backendToDeviceType(backend()); } - virtual Tensor copy(const Tensor & src, bool non_blocking=false) const = 0; + virtual Tensor copy(const Tensor & src, bool non_blocking=false, optional to_device={}) const = 0; virtual Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const = 0; virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const = 0; virtual Tensor & _s_copy_from(const Tensor & self, Tensor & dst, bool non_blocking) const = 0; @@ -144,20 +143,6 @@ struct AT_API Type { }; -inline bool Tensor::is_variable() const noexcept { - return type().is_variable(); -} - -inline ScalarType Tensor::dtype() const noexcept { - return type().scalarType(); -} - -inline Layout Tensor::layout() const noexcept { - return type().layout(); -} - -inline Device Tensor::device() const { - return Device(type().device_type(), type().is_cuda() ? 
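The `Type.h` changes here drop the `ATen/Tensor.h` include, forward-declare `struct Tensor;`, and move the inline `Tensor` methods that need a complete `Type` (`is_variable`, `dtype`, `layout`, `device`) into `TensorMethods.h`, which is included only once both classes are fully defined. A compressed single-file sketch of that layering; the section markers stand in for the real headers:

```cpp
// --- "Type.h": may refer to Tensor through an incomplete type -------------
struct Tensor;  // forward declaration suffices for parameter and return types

struct Type {
  virtual ~Type() = default;
  virtual bool is_variable() const = 0;
  virtual Tensor copy(const Tensor& src) const = 0;  // declaration only
};

// --- "Tensor.h": full Tensor definition, refers to Type by pointer --------
struct Tensor {
  const Type* type_ = nullptr;
  const Type& type() const { return *type_; }
  bool is_variable() const;  // body needs a complete Type, so it moves out
};

// --- "TensorMethods.h": bodies that need both complete classes ------------
inline bool Tensor::is_variable() const { return type().is_variable(); }

// --- toy backend + usage ---------------------------------------------------
struct CPUType final : Type {
  bool is_variable() const override { return false; }
  Tensor copy(const Tensor& src) const override { return src; }
};

int main() {
  CPUType cpu;
  Tensor t;
  t.type_ = &cpu;
  return t.is_variable() ? 1 : 0;
}
```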
get_device() : -1); -} - } // namespace at + +#include "ATen/core/Tensor.h" diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 5e614edc57f216..0891f6d9f4f492 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -22,8 +22,11 @@ Tensor & TypeDefault::copy_(Tensor & self, const Tensor & src, bool non_blocking return s_copy_(self, b_src, non_blocking); } -Tensor TypeDefault::copy(const Tensor & src, bool non_blocking) const { - // TODO(psag): have a DeviceGuard here +Tensor TypeDefault::copy(const Tensor & src, bool non_blocking, optional to_device) const { + DeviceGuard device_guard; + if (to_device.has_value()) { + device_guard.set_index(to_device.value().index()); + } AT_CHECK(src.defined(), "attempt to copy an undefined tensor"); if (is_sparse()) { auto indices = src._indices(); @@ -91,29 +94,33 @@ Tensor TypeDefault::tensorWithAllocator(IntList sizes, IntList strides, Allocato } Storage TypeDefault::storage(bool resizable) const { - return Storage(scalarType(), 0, allocator(), resizable); + return Storage(typeMeta(), 0, allocator(), resizable); } Storage TypeDefault::storage(size_t size, bool resizable) const { - return Storage(scalarType(), size, allocator(), resizable); + return Storage(typeMeta(), size, allocator(), resizable); } Storage TypeDefault::storageFromBlob(void * data, int64_t size, const std::function & deleter) const { return Storage( - scalarType(), + typeMeta(), InefficientStdFunctionContext::makeDataPtr(data, deleter, getDeviceFromPtr(data)), size, deleter); } Storage TypeDefault::storageWithAllocator(int64_t size, Allocator* allocator) const { - return Storage(scalarType(), size, allocator); + return Storage(typeMeta(), size, allocator); } Tensor TypeDefault::unsafeTensorFromTH(void * th_pointer, bool retain) const { - return Tensor(static_cast(th_pointer), retain); + auto tensor_impl = c10::intrusive_ptr::reclaim(static_cast(th_pointer)); + if (retain && tensor_impl.get() != UndefinedTensorImpl::singleton()) { + c10::raw::intrusive_ptr::incref(tensor_impl.get()); + } + return Tensor(std::move(tensor_impl)); } Storage TypeDefault::unsafeStorageFromTH(void * th_pointer, bool retain) const { if (retain && th_pointer) { c10::raw::intrusive_ptr::incref(static_cast(th_pointer)); } - return Storage(static_cast(th_pointer)); + return Storage(c10::intrusive_ptr::reclaim(static_cast(th_pointer))); } diff --git a/aten/src/ATen/templates/TypeDefault.h b/aten/src/ATen/templates/TypeDefault.h index 64ec158f82349e..e4a75abb48993e 100644 --- a/aten/src/ATen/templates/TypeDefault.h +++ b/aten/src/ATen/templates/TypeDefault.h @@ -2,13 +2,13 @@ // ${generated_comment} -#include "ATen/Type.h" +#include "ATen/TypeExtendedInterface.h" namespace at { -struct AT_API TypeDefault : public Type { +struct AT_API TypeDefault : public TypeExtendedInterface { explicit TypeDefault(TensorTypeId type_id, bool is_variable, bool is_undefined) - : Type(type_id, is_variable, is_undefined) {} + : TypeExtendedInterface(type_id, is_variable, is_undefined) {} // Make sure overload resolution considers the nullary virtual method. // (A single argument overload is generated in the list.) 
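`unsafeTensorFromTH` above now adopts the raw `TensorImpl*` with `intrusive_ptr::reclaim` (take over the reference the pointer already carries, without touching the refcount) and only bumps the count manually when `retain` is requested and the impl is not the undefined singleton. A toy refcounting sketch of that adopt-versus-retain distinction; this illustrates the idea only and is not c10's `intrusive_ptr`:

```cpp
#include <cassert>

struct Counted { int refcount = 1; };  // freshly created objects own one reference

void incref(Counted* p) { ++p->refcount; }
void decref(Counted* p) { if (--p->refcount == 0) delete p; }

// Minimal owning handle in the spirit of an intrusive pointer.
struct Handle {
  Counted* ptr;
  explicit Handle(Counted* p) : ptr(p) {}
  Handle(Handle&& o) : ptr(o.ptr) { o.ptr = nullptr; }
  Handle(const Handle&) = delete;
  Handle& operator=(const Handle&) = delete;
  ~Handle() { if (ptr) decref(ptr); }

  // "reclaim": adopt the reference the raw pointer already owns (no incref).
  static Handle reclaim(Counted* raw) { return Handle(raw); }
  // reclaim after an explicit incref: the caller keeps its own reference too.
  static Handle reclaim_retained(Counted* raw) { incref(raw); return Handle(raw); }
};

int main() {
  // retain == false: the handle simply takes over the existing reference.
  Counted* a = new Counted();
  { Handle h = Handle::reclaim(a); assert(a->refcount == 1); }  // a deleted here

  // retain == true: incref first, so the caller's reference stays valid.
  Counted* b = new Counted();
  { Handle h = Handle::reclaim_retained(b); assert(b->refcount == 2); }
  assert(b->refcount == 1);
  decref(b);  // caller drops its own reference; b deleted here
  return 0;
}
```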
@@ -25,7 +25,7 @@ struct AT_API TypeDefault : public Type { Type & toBackend(Backend b) const override; Type & toScalarType(ScalarType s) const override; - Tensor copy(const Tensor & src, bool non_blocking=false) const override; + Tensor copy(const Tensor & src, bool non_blocking=false, optional to_device={}) const override; Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const override; void backward(Tensor & self, at::optional gradient, bool keep_graph, bool create_graph) const override; diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index 5b26400ea52f12..d012274c5fceed 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -10,11 +10,11 @@ $th_headers $storage_tensor_headers #include "ATen/${Generator}.h" -#include "ATen/TensorImpl.h" +#include "ATen/core/TensorImpl.h" #include "ATen/Allocator.h" #include "ATen/DeviceGuard.h" #include "ATen/NativeFunctions.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include "ATen/Utils.h" #include "ATen/WrapDimUtils.h" #include "ATen/core/Half.h" @@ -32,9 +32,15 @@ namespace at { ${Type}::${Type}() : ${DenseBackend}TypeDefault(${Backend}TensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} + ScalarType ${Type}::scalarType() const { return ScalarType::${ScalarName}; } + +caffe2::TypeMeta ${Type}::typeMeta() const { + return caffe2::TypeMeta::Make<${ScalarType}>(); +} + Backend ${Type}::backend() const { return Backend::${Backend}; } diff --git a/aten/src/ATen/templates/TypeDerived.h b/aten/src/ATen/templates/TypeDerived.h index 3a48d8b26e32b4..116df9b4d465fe 100644 --- a/aten/src/ATen/templates/TypeDerived.h +++ b/aten/src/ATen/templates/TypeDerived.h @@ -19,6 +19,7 @@ namespace at { struct ${Type} final : public ${DenseBackend}TypeDefault { explicit ${Type}(); virtual ScalarType scalarType() const override; + virtual caffe2::TypeMeta typeMeta() const override; virtual Backend backend() const override; virtual const char * toString() const override; virtual size_t elementSizeInBytes() const override; diff --git a/aten/src/ATen/templates/TypeExtendedInterface.h b/aten/src/ATen/templates/TypeExtendedInterface.h new file mode 100644 index 00000000000000..82cb658c9eeea8 --- /dev/null +++ b/aten/src/ATen/templates/TypeExtendedInterface.h @@ -0,0 +1,12 @@ +#pragma once +#include + +namespace at { + +struct AT_API TypeExtendedInterface : public Type { + explicit TypeExtendedInterface(TensorTypeId type_id, bool is_variable, bool is_undefined) + : Type(type_id, is_variable, is_undefined) {} + ${pure_virtual_extended_type_method_declarations} +}; + +} // namespace at diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index 4fcbeaa137ae78..8103f025988aab 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -27,7 +27,8 @@ list(APPEND ATen_CUDA_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/apply_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/stream_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_half_test.cu - ${CMAKE_CURRENT_SOURCE_DIR}/cuda_optional_test.cu) + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_optional_test.cu + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_packedtensoraccessor_test.cu) if (CUDNN_FOUND) list(APPEND ATen_CUDA_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cudnn_test.cpp) diff --git a/aten/src/ATen/test/apply_test.cpp b/aten/src/ATen/test/apply_test.cpp index 986f599da6d11b..fc39eccee3926b 100644 --- a/aten/src/ATen/test/apply_test.cpp +++ 
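The new `TypeExtendedInterface.h` template above splits the generated pure virtuals out of the core `Type`: `Type` keeps the stable, hand-written surface, `TypeExtendedInterface` layers the generated `${pure_virtual_extended_type_method_declarations}` on top, and `TypeDefault` (and hence the per-backend types) now derive from the extended interface. A skeletal sketch of that layering; the method names and the single constructor argument are made up:

```cpp
// Core interface: what ordinary consumers of "Type" rely on.
struct Type {
  explicit Type(bool is_variable) : is_variable_(is_variable) {}
  virtual ~Type() = default;
  virtual const char* toString() const = 0;
  bool is_variable() const { return is_variable_; }
 private:
  bool is_variable_;
};

// Extended interface: generated, ATen-internal entry points live here.
struct TypeExtendedInterface : public Type {
  explicit TypeExtendedInterface(bool is_variable) : Type(is_variable) {}
  virtual int generated_entry_point() const = 0;  // hypothetical generated method
};

// The default implementation derives from the extended interface, as in the diff.
struct TypeDefault : public TypeExtendedInterface {
  explicit TypeDefault(bool is_variable) : TypeExtendedInterface(is_variable) {}
  const char* toString() const override { return "TypeDefault"; }
  int generated_entry_point() const override { return 0; }
};

int main() {
  TypeDefault t(/*is_variable=*/false);
  return t.is_variable() ? 1 : t.generated_entry_point();
}
```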
b/aten/src/ATen/test/apply_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "cuda.h" #include "cuda_runtime.h" @@ -11,111 +11,111 @@ Tests related to tensor indexing and applying operations. */ #ifndef _WIN32 -TEST_CASE("2D Contiguous", "Collapses a 2D contiguous tensor to 1D contiguous") { +CATCH_TEST_CASE("2D Contiguous", "Collapses a 2D contiguous tensor to 1D contiguous") { int sizes[] = {4, 4}; int strides[] = {4, 1}; ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 1); - REQUIRE(ti.sizes[0] == (4 * 4)); + CATCH_REQUIRE(ti.dims == 1); + CATCH_REQUIRE(ti.sizes[0] == (4 * 4)); } -TEST_CASE("3D Contiguous", "Collapses a 3D contiguous tensor to a 1D contiguous") { +CATCH_TEST_CASE("3D Contiguous", "Collapses a 3D contiguous tensor to a 1D contiguous") { int sizes[] = {6, 3, 7}; int strides[] = {3 * 7, 7, 1}; ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 1); - REQUIRE(ti.sizes[0] == (6 * 3 * 7)); + CATCH_REQUIRE(ti.dims == 1); + CATCH_REQUIRE(ti.sizes[0] == (6 * 3 * 7)); } -TEST_CASE("3D Partial Collapse", "Collapses a 3D noncontiguous tensor to a 2D tensor") { +CATCH_TEST_CASE("3D Partial Collapse", "Collapses a 3D noncontiguous tensor to a 2D tensor") { int sizes[] = {4, 3, 2}; int strides[] = {3 * 3, 3, 1}; ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 2); - REQUIRE(ti.sizes[0] == (4 * 3)); - REQUIRE(ti.sizes[1] == 2); + CATCH_REQUIRE(ti.dims == 2); + CATCH_REQUIRE(ti.sizes[0] == (4 * 3)); + CATCH_REQUIRE(ti.sizes[1] == 2); } -TEST_CASE("2D Strided Collapse", "Collapses a 2D skip contiguous tensor to a 1D skip contiguous tensor") { +CATCH_TEST_CASE("2D Strided Collapse", "Collapses a 2D skip contiguous tensor to a 1D skip contiguous tensor") { int sizes[] = {3, 2}; int strides[] = {2 * 2, 2}; ::at::cuda::detail::TensorInfo ti{nullptr, 2, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 1); - REQUIRE(ti.sizes[0] == (3 * 2)); - REQUIRE(ti.strides[0] == 2); + CATCH_REQUIRE(ti.dims == 1); + CATCH_REQUIRE(ti.sizes[0] == (3 * 2)); + CATCH_REQUIRE(ti.strides[0] == 2); } -TEST_CASE("4D Partial Strided Collapse", "Collapses a 4D tensor to a 2D tensor"){ +CATCH_TEST_CASE("4D Partial Strided Collapse", "Collapses a 4D tensor to a 2D tensor"){ int sizes[] = {3, 6, 5, 2}; int strides[] = {6 * 22, 22, 2 * 2, 2}; ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 2); - REQUIRE(ti.sizes[0] == (3 * 6)); - REQUIRE(ti.strides[0] == 22); - REQUIRE(ti.sizes[1] == (5 * 2)); - REQUIRE(ti.strides[1] == 2); + CATCH_REQUIRE(ti.dims == 2); + CATCH_REQUIRE(ti.sizes[0] == (3 * 6)); + CATCH_REQUIRE(ti.strides[0] == 22); + CATCH_REQUIRE(ti.sizes[1] == (5 * 2)); + CATCH_REQUIRE(ti.strides[1] == 2); } -TEST_CASE("Collapsing Zeros and Ones", "Collapses a 5D tensor to a 1D tensor") { +CATCH_TEST_CASE("Collapsing Zeros and Ones", "Collapses a 5D tensor to a 1D tensor") { int sizes[] = {1, 10, 1, 5, 4}; int strides[] = {4, 0, 16, 0, 1}; ::at::cuda::detail::TensorInfo ti{nullptr, 5, sizes, strides}; ti.collapseDims(); - REQUIRE(ti.dims == 2); - REQUIRE(ti.sizes[0] == (10 * 5)); - REQUIRE(ti.strides[0] == 0); - REQUIRE(ti.sizes[1] == 4); - REQUIRE(ti.strides[1] == 1); + CATCH_REQUIRE(ti.dims == 2); + CATCH_REQUIRE(ti.sizes[0] == (10 * 5)); + CATCH_REQUIRE(ti.strides[0] == 0); + CATCH_REQUIRE(ti.sizes[1] == 4); + CATCH_REQUIRE(ti.strides[1] 
== 1); } -TEST_CASE("Collapsing to a Point Tensor", "Collapses a 3D tensor to a point tensor") { +CATCH_TEST_CASE("Collapsing to a Point Tensor", "Collapses a 3D tensor to a point tensor") { int sizes[] = {1, 1, 1}; int strides[] = {17, 12, 3}; ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - REQUIRE(ti.collapseDims() == 0); - REQUIRE(ti.dims == 1); - REQUIRE(ti.sizes[0] == 1); - REQUIRE(ti.strides[0] == 1); + CATCH_REQUIRE(ti.collapseDims() == 0); + CATCH_REQUIRE(ti.dims == 1); + CATCH_REQUIRE(ti.sizes[0] == 1); + CATCH_REQUIRE(ti.strides[0] == 1); } -TEST_CASE("Excluding in a 4D Contiguous", "Collapses a 4D tensor to a 3D tensor") { +CATCH_TEST_CASE("Excluding in a 4D Contiguous", "Collapses a 4D tensor to a 3D tensor") { int sizes[] = {3, 6, 5, 2}; int strides[] = {6 * 22, 22, 2 * 2, 2}; ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; - REQUIRE(ti.collapseDims(1) == 1); - REQUIRE(ti.dims == 3); - REQUIRE(ti.sizes[0] == 3); - REQUIRE(ti.strides[0] == (6 * 22)); - REQUIRE(ti.sizes[1] == 6); - REQUIRE(ti.strides[1] == 22); - REQUIRE(ti.sizes[2] == (5 * 2)); - REQUIRE(ti.strides[2] == 2); + CATCH_REQUIRE(ti.collapseDims(1) == 1); + CATCH_REQUIRE(ti.dims == 3); + CATCH_REQUIRE(ti.sizes[0] == 3); + CATCH_REQUIRE(ti.strides[0] == (6 * 22)); + CATCH_REQUIRE(ti.sizes[1] == 6); + CATCH_REQUIRE(ti.strides[1] == 22); + CATCH_REQUIRE(ti.sizes[2] == (5 * 2)); + CATCH_REQUIRE(ti.strides[2] == 2); } -TEST_CASE("Roving Exclusion", "Collapses a 4D tensor to a 3D tensor") { +CATCH_TEST_CASE("Roving Exclusion", "Collapses a 4D tensor to a 3D tensor") { int sizes[] = {3, 6, 5, 2}; int strides[] = {6 * 22, 22, 2 * 2, 2}; ::at::cuda::detail::TensorInfo ti{nullptr, 4, sizes, strides}; - REQUIRE(ti.collapseDims(2) == 1); - REQUIRE(ti.dims == 3); - REQUIRE(ti.sizes[0] == (3 * 6)); - REQUIRE(ti.strides[0] == 22); - REQUIRE(ti.sizes[1] == 5); - REQUIRE(ti.strides[1] == 4); - REQUIRE(ti.sizes[2] == 2); - REQUIRE(ti.strides[2] == 2); + CATCH_REQUIRE(ti.collapseDims(2) == 1); + CATCH_REQUIRE(ti.dims == 3); + CATCH_REQUIRE(ti.sizes[0] == (3 * 6)); + CATCH_REQUIRE(ti.strides[0] == 22); + CATCH_REQUIRE(ti.sizes[1] == 5); + CATCH_REQUIRE(ti.strides[1] == 4); + CATCH_REQUIRE(ti.sizes[2] == 2); + CATCH_REQUIRE(ti.strides[2] == 2); } -TEST_CASE("Invalid Exclusion", "Attempts to exclude a nonexisting dimension") { +CATCH_TEST_CASE("Invalid Exclusion", "Attempts to exclude a nonexisting dimension") { int sizes[] = {1, 1, 1}; int strides[] = {17, 12, 3}; ::at::cuda::detail::TensorInfo ti{nullptr, 3, sizes, strides}; - REQUIRE_THROWS(ti.collapseDims(5)); + _CATCH_REQUIRE_THROWS(ti.collapseDims(5)); } #endif diff --git a/aten/src/ATen/test/apply_utils_test.cpp b/aten/src/ATen/test/apply_utils_test.cpp index 38027baae97b73..22be6de7acbc02 100644 --- a/aten/src/ATen/test/apply_utils_test.cpp +++ b/aten/src/ATen/test/apply_utils_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/CPUApplyUtils.h" @@ -108,32 +108,32 @@ void test(Type& type, IntList shape, int64_t a = 0, int64_t b = 1) { }); } -TEST_CASE("apply utils test 2-dim small contiguous", "[cpu]") { +CATCH_TEST_CASE("apply utils test 2-dim small contiguous", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {2, 1}, -1, -1); } -TEST_CASE("apply utils test 2-dim small", "[cpu]") { +CATCH_TEST_CASE("apply utils test 2-dim small", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {2, 1}); } -TEST_CASE("apply utils test 2-dim", "[cpu]") { 
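The `apply_test.cpp` cases above (now using the `CATCH_`-prefixed macros) all exercise `TensorInfo::collapseDims`, which drops size-1 dimensions and merges adjacent dimensions whenever the outer stride equals the inner size times the inner stride. A standalone sketch of that rule, checked against two of the test expectations; it ignores the optional excluded-dimension argument that the "Exclusion" cases cover:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

struct Dims { std::vector<long> sizes, strides; };

// Simplified version of the collapsing rule the tests above verify.
Dims collapse(const std::vector<long>& sizes, const std::vector<long>& strides) {
  Dims out;
  for (std::size_t i = 0; i < sizes.size(); ++i) {
    if (sizes[i] == 1) continue;                        // size-1 dims vanish
    if (!out.sizes.empty() &&
        out.strides.back() == sizes[i] * strides[i]) {  // contiguous pair:
      out.sizes.back() *= sizes[i];                     // merge into one dim
      out.strides.back() = strides[i];
    } else {
      out.sizes.push_back(sizes[i]);
      out.strides.push_back(strides[i]);
    }
  }
  if (out.sizes.empty()) { out.sizes = {1}; out.strides = {1}; }  // point tensor
  return out;
}

int main() {
  Dims d = collapse({4, 4}, {4, 1});   // "2D Contiguous" case
  assert(d.sizes.size() == 1 && d.sizes[0] == 16);

  Dims s = collapse({3, 2}, {4, 2});   // "2D Strided Collapse" case
  assert(s.sizes[0] == 6 && s.strides[0] == 2);
  return 0;
}
```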
+CATCH_TEST_CASE("apply utils test 2-dim", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {20, 10}); } -TEST_CASE("apply utils test 3-dim", "[cpu]") { +CATCH_TEST_CASE("apply utils test 3-dim", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 4, 2}); } -TEST_CASE("apply utils test 3-dim medium", "[cpu]") { +CATCH_TEST_CASE("apply utils test 3-dim medium", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 40, 2}); } -TEST_CASE("apply utils test 10-dim", "[cpu]") { +CATCH_TEST_CASE("apply utils test 10-dim", "[cpu]") { manual_seed(123, at::kCPU); test(CPU(kDouble), {3, 4, 2, 5, 2, 1, 3, 4, 2, 3}); } diff --git a/aten/src/ATen/test/atest.cpp b/aten/src/ATen/test/atest.cpp index dd37fa86af3a36..8dffa3d7c02c75 100644 --- a/aten/src/ATen/test/atest.cpp +++ b/aten/src/ATen/test/atest.cpp @@ -1,5 +1,4 @@ -#define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "gtest/gtest.h" #include "ATen/ATen.h" #include "test_seed.h" @@ -19,26 +18,26 @@ void trace() { trace += foo_a[i][i]; } - REQUIRE(foo.trace().toCFloat() == Approx(trace)); + EXPECT_FLOAT_EQ(foo.trace().toCFloat(), trace); } -TEST_CASE( "atest", "[]" ) { - +// TEST_CASE( "atest", "[]" ) { +TEST(atest, atest) { manual_seed(123, at::kCPU); manual_seed(123, at::kCUDA); auto foo = rand({12,6}); - REQUIRE(foo.data() == foo.toFloatData()); + EXPECT_EQ(foo.data(), foo.toFloatData()); - REQUIRE(foo.size(0) == 12); - REQUIRE(foo.size(1) == 6); + EXPECT_EQ(foo.size(0), 12); + EXPECT_EQ(foo.size(1), 6); foo = foo+foo*3; foo -= 4; Scalar a = 4; float b = a.to(); - REQUIRE(b == 4); + EXPECT_EQ(b, 4); foo = (foo*foo) == (foo.pow(3)); foo = 2 + (foo+1); @@ -51,7 +50,7 @@ TEST_CASE( "atest", "[]" ) { } } - REQUIRE(foo.equal(4 * ones({12, 6}, kByte))); + EXPECT_TRUE(foo.equal(4 * ones({12, 6}, kByte))); trace(); @@ -61,17 +60,18 @@ TEST_CASE( "atest", "[]" ) { auto f = CPU(kFloat).tensorFromBlob(data, {1,2,3}); auto f_a = f.accessor(); - REQUIRE(f_a[0][0][0] == 1.0); - REQUIRE(f_a[0][1][1] == 5.0); + EXPECT_EQ(f_a[0][0][0], 1.0); + EXPECT_EQ(f_a[0][1][1], 5.0); - REQUIRE(f.strides()[0] == 6); - REQUIRE(f.strides()[1] == 3); - REQUIRE(f.strides()[2] == 1); - REQUIRE(f.sizes()[0] == 1); - REQUIRE(f.sizes()[1] == 2); - REQUIRE(f.sizes()[2] == 3); + EXPECT_EQ(f.strides()[0], 6); + EXPECT_EQ(f.strides()[1], 3); + EXPECT_EQ(f.strides()[2], 1); + EXPECT_EQ(f.sizes()[0], 1); + EXPECT_EQ(f.sizes()[1], 2); + EXPECT_EQ(f.sizes()[2], 3); - REQUIRE_THROWS(f.resize_({3,4,5})); + // TODO(ezyang): maybe do a more precise exception type. 
+ ASSERT_THROW(f.resize_({3,4,5}), std::exception); { int isgone = 0; { @@ -79,7 +79,7 @@ TEST_CASE( "atest", "[]" ) { isgone++; }); } - REQUIRE(isgone == 1); + EXPECT_EQ(isgone, 1); } { int isgone = 0; @@ -90,9 +90,9 @@ TEST_CASE( "atest", "[]" ) { }); a_view = f2.view({3,2,1}); } - REQUIRE(isgone == 0); + EXPECT_EQ(isgone, 0); a_view.reset(); - REQUIRE(isgone == 1); + EXPECT_EQ(isgone, 1); } if(at::hasCUDA()) { @@ -103,6 +103,6 @@ TEST_CASE( "atest", "[]" ) { isgone++; }); } - REQUIRE(isgone==1); + EXPECT_EQ(isgone, 1); } } diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 94988122adedcc..c64fdec0089dff 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/core/Reduction.h" @@ -20,66 +20,66 @@ using namespace at; using Catch::Matchers::StartsWith; static void test(Type & type) { - SECTION( "resize" ) { + CATCH_SECTION( "resize" ) { auto a = type.tensor(); a.resize_({3,4}); - REQUIRE(a.numel() == 12); + CATCH_REQUIRE(a.numel() == 12); a.resize_({5, 7}); - REQUIRE(a.numel() == 35); + CATCH_REQUIRE(a.numel() == 35); } - SECTION( "ones and dot" ) { + CATCH_SECTION( "ones and dot" ) { Tensor b0 = ones({1, 1}, type); - REQUIRE(2 == (b0+b0).sum().toCDouble()); + CATCH_REQUIRE(2 == (b0+b0).sum().toCDouble()); Tensor b1 = ones({1, 2}, type); - REQUIRE(4 == (b1+b1).sum().toCDouble()); + CATCH_REQUIRE(4 == (b1+b1).sum().toCDouble()); Tensor b = ones({3, 4}, type); - REQUIRE(24 == (b+b).sum().toCDouble()); - REQUIRE(12 == b.numel()); - REQUIRE(b.view(-1).dot(b.view(-1)).toCDouble() == 12); + CATCH_REQUIRE(24 == (b+b).sum().toCDouble()); + CATCH_REQUIRE(12 == b.numel()); + CATCH_REQUIRE(b.view(-1).dot(b.view(-1)).toCDouble() == 12); } - SECTION( "rand" ) { + CATCH_SECTION( "rand" ) { for(auto i = 0; i < 10; i++) { Tensor a = rand({3,4}, type.toScalarType(i % 2 == 0 ? kFloat : kDouble)); } } - SECTION( "sort" ) { + CATCH_SECTION( "sort" ) { Tensor b = rand({3, 4}, type); auto z = b.sort(1); auto z_sorted = std::get<0>(z); - REQUIRE(z_sorted[0][0].toCFloat() < z_sorted[0][1].toCFloat()); + CATCH_REQUIRE(z_sorted[0][0].toCFloat() < z_sorted[0][1].toCFloat()); } if(type.backend() != Backend::CUDA) - SECTION( "randperm" ) { + CATCH_SECTION( "randperm" ) { Tensor b = randperm(15, type); Tensor rv, ri; std::tie(rv, ri) = sort(b, 0); - REQUIRE(rv[0].toCFloat() <= rv[1].toCFloat()); + CATCH_REQUIRE(rv[0].toCFloat() <= rv[1].toCFloat()); } - SECTION( "context" ) { + CATCH_SECTION( "context" ) { std::stringstream ss; ss << "context: " << std::hex << (int64_t)&globalContext() << std::endl; } - SECTION( "add" ) { + CATCH_SECTION( "add" ) { Tensor a = rand({3, 4}, type); Tensor b = rand({3, 4}, type); Tensor c = add(a, add(a, b)); //TODO:0-dim Tensor d(3.f); Scalar d = 3.f; - REQUIRE( add(c, d).allclose(a + a + b + d) ); + CATCH_REQUIRE( add(c, d).allclose(a + a + b + d) ); } - SECTION( "loads of adds" ) { + CATCH_SECTION( "loads of adds" ) { auto begin = std::chrono::high_resolution_clock::now(); Tensor d = ones({3, 4}, type); Tensor r = zeros({3, 4}, type); @@ -89,10 +89,10 @@ static void test(Type & type) { auto end = std::chrono::high_resolution_clock::now(); //TODO TEST PERF? 
std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; - REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); + CATCH_REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); } - SECTION( "loads of adds (with copy)" ) { + CATCH_SECTION( "loads of adds (with copy)" ) { auto begin = std::chrono::high_resolution_clock::now(); Tensor d = ones({3, 4}, type); Tensor r = zeros({3, 4}, type); @@ -102,59 +102,59 @@ static void test(Type & type) { auto end = std::chrono::high_resolution_clock::now(); //TODO TEST PERF? std::cout << std::dec << " " << std::chrono::duration_cast(end-begin).count() << " ms" << std::endl; - REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); + CATCH_REQUIRE(norm(100000*d).toCDouble() == norm(r).toCDouble()); } - SECTION( "isContiguous" ) { + CATCH_SECTION( "isContiguous" ) { Tensor a = rand({3, 4}, type); - REQUIRE(a.is_contiguous()); + CATCH_REQUIRE(a.is_contiguous()); a = a.transpose(0, 1); - REQUIRE(!a.is_contiguous()); + CATCH_REQUIRE(!a.is_contiguous()); } - SECTION( "permute" ) { + CATCH_SECTION( "permute" ) { Tensor a = rand({3, 4, 5}, type); Tensor b = a.permute({1, 2, 0}); - REQUIRE(b.sizes().equals({4, 5, 3})); - REQUIRE(b.strides().equals({5, 1, 20})); + CATCH_REQUIRE(b.sizes().equals({4, 5, 3})); + CATCH_REQUIRE(b.strides().equals({5, 1, 20})); } - SECTION( "mm" ) { + CATCH_SECTION( "mm" ) { Tensor a = rand({3, 4}, type); Tensor b = rand({4}, type); Tensor c = mv(a, b); - REQUIRE(c.equal(addmv(zeros({3}, type), a, b, 0, 1))); + CATCH_REQUIRE(c.equal(addmv(zeros({3}, type), a, b, 0, 1))); } - SECTION( "squeeze" ) { + CATCH_SECTION( "squeeze" ) { Tensor a = rand({2, 1}, type); Tensor b = squeeze(a); - REQUIRE(b.dim() == 1); + CATCH_REQUIRE(b.dim() == 1); a = rand({1}, type); b = squeeze(a); //TODO 0-dim squeeze - REQUIRE(a[0].equal(b)); + CATCH_REQUIRE(a[0].equal(b)); } - SECTION( "copy" ) { + CATCH_SECTION( "copy" ) { Tensor a = zeros({4, 3}, type); Tensor e = rand({4, 3}, type); a.copy_(e); - REQUIRE(a.equal(e)); + CATCH_REQUIRE(a.equal(e)); } - SECTION( "copy (broadcasting)" ) { + CATCH_SECTION( "copy (broadcasting)" ) { Tensor a = zeros({4, 3}, type); Tensor e = rand({3}, type); a.copy_(e); for (int i = 0; i < 4; ++i) { - REQUIRE(a[i].equal(e)); + CATCH_REQUIRE(a[i].equal(e)); } } - SECTION( "abs(value)" ) { + CATCH_SECTION( "abs(value)" ) { Tensor r = at::abs(type.scalarTensor(-3)); - REQUIRE(r.toCInt() == 3); + CATCH_REQUIRE(r.toCInt() == 3); } //TODO(zach): operator overloads @@ -168,120 +168,120 @@ static void test(Type & type) { } #endif - SECTION( "adding a value with a scalar" ) { + CATCH_SECTION( "adding a value with a scalar" ) { Tensor a = rand({4, 3}, type); - REQUIRE((ones({4,3}, type) + a).equal(add(a,1))); + CATCH_REQUIRE((ones({4,3}, type) + a).equal(add(a,1))); } - SECTION( "select" ) { + CATCH_SECTION( "select" ) { Tensor a = rand({3, 7}, type); auto a_13 = select(a, 1, 3); auto a_13_02 = select(select(a, 1, 3), 0, 2); - REQUIRE( a[0][3].equal(a_13[0]) ); - REQUIRE( a[2][3].equal(a_13_02) ); + CATCH_REQUIRE( a[0][3].equal(a_13[0]) ); + CATCH_REQUIRE( a[2][3].equal(a_13_02) ); } - SECTION( "zero-dim" ) { + CATCH_SECTION( "zero-dim" ) { Tensor a = type.scalarTensor(4); //rand(type, {1}); Tensor b = rand({3,4}, type); - REQUIRE((a + a).dim() == 0); - REQUIRE((1 + a).dim() == 0); - REQUIRE((b + a).dim() == 2); - REQUIRE((a + b).dim() == 2); + CATCH_REQUIRE((a + a).dim() == 0); + CATCH_REQUIRE((1 + a).dim() == 0); + CATCH_REQUIRE((b + a).dim() == 2); + CATCH_REQUIRE((a + b).dim() == 
2); auto c = rand({3,4}, type); - REQUIRE(c[1][2].dim() == 0); + CATCH_REQUIRE(c[1][2].dim() == 0); auto f = rand({3,4}, type); f[2] = zeros({4}, type); f[1][0] = -1; - REQUIRE(f[2][0].toCDouble() == 0); + CATCH_REQUIRE(f[2][0].toCDouble() == 0); } - SECTION( "tensor from TH" ) { + CATCH_SECTION( "tensor from TH" ) { int a = 4; THFloatTensor *t = THFloatTensor_newWithSize2d(a, a); THFloatTensor_fill(t, a); Tensor tt = CPU(kFloat).unsafeTensorFromTH(t,false); - REQUIRE_NOTHROW(tt); + CATCH_REQUIRE_NOTHROW(tt); } - SECTION( "toCFloat" ) { + CATCH_SECTION( "toCFloat" ) { Tensor a = zeros({3,4}); Tensor b = ones({3,7}); Tensor c = cat({a,b},1); - REQUIRE(c.size(1) == 11); + CATCH_REQUIRE(c.size(1) == 11); Tensor e = rand({}); - REQUIRE(*e.data() == e.sum().toCFloat()); + CATCH_REQUIRE(*e.data() == e.sum().toCFloat()); } - SECTION( "to string" ) { + CATCH_SECTION( "to string" ) { Tensor b = ones({3,7})*.0000001f; std::stringstream s; s << b << "\n"; std::string expect = "1e-07 *"; - REQUIRE(s.str().substr(0,expect.size()) == expect); + CATCH_REQUIRE(s.str().substr(0,expect.size()) == expect); } - SECTION("indexing by Scalar") { + CATCH_SECTION("indexing by Scalar") { Tensor tensor = arange(0, 10, kInt); Tensor one = ones({}, kInt); for (int64_t i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[i].equal(one * i)); + CATCH_REQUIRE(tensor[i].equal(one * i)); } for (size_t i = 0; i < static_cast(tensor.numel()); ++i) { - REQUIRE(tensor[i].equal(one * static_cast(i))); + CATCH_REQUIRE(tensor[i].equal(one * static_cast(i))); } for (int i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[i].equal(one * i)); + CATCH_REQUIRE(tensor[i].equal(one * i)); } for (int16_t i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[i].equal(one * i)); + CATCH_REQUIRE(tensor[i].equal(one * i)); } for (int8_t i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[i].equal(one * i)); + CATCH_REQUIRE(tensor[i].equal(one * i)); } - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( tensor[Scalar(3.14)].equal(one), StartsWith( "Can only index tensors with integral scalars")); } - SECTION("indexing by zero-dim tensor") { + CATCH_SECTION("indexing by zero-dim tensor") { Tensor tensor = arange(0, 10, kInt); Tensor one = ones({}, kInt); for (int i = 0; i < tensor.numel(); ++i) { - REQUIRE(tensor[one * i].equal(one * i)); + CATCH_REQUIRE(tensor[one * i].equal(one * i)); } - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( tensor[ones({}) * 3.14].equal(one), StartsWith( "Can only index tensors with integral scalars")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( tensor[Tensor()].equal(one), StartsWith("Can only index with tensors that are defined")); - REQUIRE_THROWS_WITH( + CATCH_REQUIRE_THROWS_WITH( tensor[ones({2, 3, 4}, kInt)].equal(one), StartsWith("Can only index with tensors that are scalars (zero-dim)")); } - SECTION("dispatch") { + CATCH_SECTION("dispatch") { Tensor tensor = randn({20, 20}); Tensor other = randn({20, 20}); auto result = tensor.m(relu).m(mse_loss, other, Reduction::ElementwiseMean); - REQUIRE(result.allclose(mse_loss(relu(tensor), other))); + CATCH_REQUIRE(result.allclose(mse_loss(relu(tensor), other))); } - SECTION("core") { + CATCH_SECTION("core") { int i = CoreTest(); - REQUIRE(i + 1 == CoreTest()); + CATCH_REQUIRE(i + 1 == CoreTest()); } } -TEST_CASE( "basic tests CPU", "[cpu]" ) { +CATCH_TEST_CASE( "basic tests CPU", "[cpu]" ) { manual_seed(123, at::kCPU); test(CPU(kFloat)); } -TEST_CASE( "basic tests GPU", "[cuda]" ) { +CATCH_TEST_CASE( "basic tests GPU", "[cuda]" ) { manual_seed(123, at::kCUDA); 
if(at::hasCUDA()) { diff --git a/aten/src/ATen/test/broadcast_test.cpp b/aten/src/ATen/test/broadcast_test.cpp index cd5c43d32fae86..822a1d79df1bda 100644 --- a/aten/src/ATen/test/broadcast_test.cpp +++ b/aten/src/ATen/test/broadcast_test.cpp @@ -1,154 +1,154 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "test_seed.h" using namespace at; -TEST_CASE( "broadcast", "[]" ) { +CATCH_TEST_CASE( "broadcast", "[]" ) { manual_seed(123, at::kCPU); Type & T = CPU(kFloat); // 0) pre-req tests: - SECTION( "can't expand empty tensor" ) { + CATCH_SECTION( "can't expand empty tensor" ) { auto empty = randn({0}, T); - REQUIRE_THROWS(empty.expand({3})); + _CATCH_REQUIRE_THROWS(empty.expand({3})); } // 1) out-place function with 2 args - SECTION( "out-place function with 2 args" ) { + CATCH_SECTION( "out-place function with 2 args" ) { - SECTION( "basic" ) { + CATCH_SECTION( "basic" ) { auto a = randn({3, 1}, T); auto b = randn({5}, T); std::vector expanded_sizes = {3, 5}; - REQUIRE((a + b).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes))); + CATCH_REQUIRE((a + b).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { auto aScalar = ones({1}, T); aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); auto b = randn({3, 5}, T); - REQUIRE((aScalar + b).equal(aScalar.expand(b.sizes()) + b.expand(b.sizes()))); + CATCH_REQUIRE((aScalar + b).equal(aScalar.expand(b.sizes()) + b.expand(b.sizes()))); } - SECTION( "old fallback behavior yields error" ) { + CATCH_SECTION( "old fallback behavior yields error" ) { auto a = randn({3, 5}, T); auto b = randn({5, 3}, T); - REQUIRE_THROWS(a + b); + _CATCH_REQUIRE_THROWS(a + b); } - SECTION( "with mismatched sizes" ) { + CATCH_SECTION( "with mismatched sizes" ) { auto a = randn({3, 5}, T); auto b = randn({7, 5}, T); - REQUIRE_THROWS(a + b); + _CATCH_REQUIRE_THROWS(a + b); } } - SECTION( "out-place function with 3 args" ) { + CATCH_SECTION( "out-place function with 3 args" ) { - SECTION( "basic" ) { + CATCH_SECTION( "basic" ) { auto a = randn({3, 1, 1}, T); auto b = randn({1, 2, 1}, T); auto c = randn({1, 1, 5}, T); std::vector expanded_sizes = {3, 2, 5}; - REQUIRE((a + b + c).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes) + c.expand(expanded_sizes))); + CATCH_REQUIRE((a + b + c).equal(a.expand(expanded_sizes) + b.expand(expanded_sizes) + c.expand(expanded_sizes))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { auto aTensorScalar = ones({1}, T); aTensorScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); auto b = randn({3, 2, 1}, T); auto c = randn({1, 2, 5}, T); std::vector expanded_sizes = {3, 2, 5}; - REQUIRE(aTensorScalar.addcmul(b, c).equal( + CATCH_REQUIRE(aTensorScalar.addcmul(b, c).equal( aTensorScalar.expand(expanded_sizes).addcmul(b.expand(expanded_sizes), c.expand(expanded_sizes)))); } - SECTION( "old fallback behavior yields error" ) { + CATCH_SECTION( "old fallback behavior yields error" ) { auto a = randn({3, 2, 5}, T); auto b = randn({2, 3, 5}, T); auto c = randn({5, 3, 2}, T); - REQUIRE_THROWS(a.addcmul(b, c)); + _CATCH_REQUIRE_THROWS(a.addcmul(b, c)); } - SECTION( "with mismatched sizes" ){ + CATCH_SECTION( "with mismatched sizes" ){ auto a = randn({3, 2, 5}, T); auto b = randn({2, 3, 5}, T); auto c = randn({5, 5, 5}, T); - REQUIRE_THROWS(a.addcmul(b, c)); + _CATCH_REQUIRE_THROWS(a.addcmul(b, c)); } } - SECTION( "in-place function with 2 args" ) { - SECTION( "basic" ) { + CATCH_SECTION( 
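The `broadcast_test.cpp` cases being converted here rely on the standard broadcasting rule: sizes are aligned from the trailing dimension, and each pair of dimensions must either match or have one side equal to 1 (which is expanded). A standalone sketch of the shape computation those tests assume, for example `{3, 1}` with `{5}` broadcasting to `{3, 5}`; the in-place cases additionally require that the in-place operand itself never needs expanding, which this sketch does not model:

```cpp
#include <cassert>
#include <cstddef>
#include <stdexcept>
#include <vector>

// Compute the broadcast shape of two size lists; throws where the tests
// above expect the REQUIRE_THROWS branches to fire.
std::vector<long> broadcast_sizes(const std::vector<long>& a,
                                  const std::vector<long>& b) {
  std::size_t ndim = a.size() > b.size() ? a.size() : b.size();
  std::vector<long> out(ndim, 1);
  for (std::size_t i = 0; i < ndim; ++i) {  // walk from the trailing dimension
    long da = i < a.size() ? a[a.size() - 1 - i] : 1;
    long db = i < b.size() ? b[b.size() - 1 - i] : 1;
    if (da != db && da != 1 && db != 1)
      throw std::runtime_error("sizes are not broadcastable");
    out[ndim - 1 - i] = (da == 1) ? db : da;
  }
  return out;
}

int main() {
  assert((broadcast_sizes({3, 1}, {5}) == std::vector<long>{3, 5}));
  assert((broadcast_sizes({3, 1, 1}, {1, 2, 1}) == std::vector<long>{3, 2, 1}));

  bool threw = false;
  try { broadcast_sizes({3, 5}, {7, 5}); }         // "with mismatched sizes"
  catch (const std::exception&) { threw = true; }
  assert(threw);
  return 0;
}
```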
"in-place function with 2 args" ) { + CATCH_SECTION( "basic" ) { auto a = randn({3, 5}, T); auto b = randn({3, 1}, T); - REQUIRE((a + b).equal(a + b.expand({3, 5}))); + CATCH_REQUIRE((a + b).equal(a + b.expand({3, 5}))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { auto a = randn({3, 5}, T); auto bScalar = ones({1}, T); bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE((a + bScalar).equal(a + bScalar.expand(a.sizes()))); + CATCH_REQUIRE((a + bScalar).equal(a + bScalar.expand(a.sizes()))); } - SECTION( "error: would have to expand inplace arg" ) { + CATCH_SECTION( "error: would have to expand inplace arg" ) { auto a = randn({1, 5}, T); auto b = randn({3, 1}, T); - REQUIRE_THROWS(a.add_(b)); + _CATCH_REQUIRE_THROWS(a.add_(b)); } } - SECTION( "in-place function with 3 args" ) { + CATCH_SECTION( "in-place function with 3 args" ) { auto a = randn({3, 5, 2}, T); auto b = randn({3, 1, 2}, T); auto c = randn({1, 5, 1}, T); - SECTION( "basic" ) { + CATCH_SECTION( "basic" ) { auto aClone = a.clone(); - REQUIRE(a.addcmul_(b, c).equal(aClone.addcmul_(b.expand(a.sizes()), c.expand(a.sizes())))); + CATCH_REQUIRE(a.addcmul_(b, c).equal(aClone.addcmul_(b.expand(a.sizes()), c.expand(a.sizes())))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { auto aClone = a.clone(); auto bScalar = ones({1}, T); bScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE(a.addcmul_(bScalar, c).equal(aClone.addcmul_(bScalar.expand(a.sizes()), c.expand(a.sizes())))); + CATCH_REQUIRE(a.addcmul_(bScalar, c).equal(aClone.addcmul_(bScalar.expand(a.sizes()), c.expand(a.sizes())))); } - SECTION( "error: would have to expand inplace arg" ) { + CATCH_SECTION( "error: would have to expand inplace arg" ) { auto a = randn({1, 3, 5}, T); auto b = randn({4, 1, 1}, T); auto c = randn({1, 3, 1}, T); - REQUIRE_THROWS(a.addcmul_(b, c)); + _CATCH_REQUIRE_THROWS(a.addcmul_(b, c)); } } - SECTION( "explicit dim specification" ) { + CATCH_SECTION( "explicit dim specification" ) { auto a = randn({1}, T); auto b = randn({5, 3}, T); auto c = randn({3, 7}, T); - SECTION( "basic" ) { - REQUIRE(a.addmm(b, c).equal(a.expand({5,7}).addmm(b, c))); + CATCH_SECTION( "basic" ) { + CATCH_REQUIRE(a.addmm(b, c).equal(a.expand({5,7}).addmm(b, c))); } - SECTION( "with scalar" ) { + CATCH_SECTION( "with scalar" ) { Tensor aScalar = ones({1}, T); aScalar.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE(aScalar.addmm(b, c).equal(aScalar.expand({5, 7}).addmm(b, c))); + CATCH_REQUIRE(aScalar.addmm(b, c).equal(aScalar.expand({5, 7}).addmm(b, c))); } - SECTION( "with mismatched sizes" ) { + CATCH_SECTION( "with mismatched sizes" ) { auto a = randn({3, 3}, T); - REQUIRE_THROWS(a.addmm(b, c)); + _CATCH_REQUIRE_THROWS(a.addmm(b, c)); } } } diff --git a/aten/src/ATen/test/catch_utils.hpp b/aten/src/ATen/test/catch_utils.hpp new file mode 100644 index 00000000000000..b9b0a87990a9ce --- /dev/null +++ b/aten/src/ATen/test/catch_utils.hpp @@ -0,0 +1,8 @@ +#pragma once + +#define CATCH_CONFIG_PREFIX_ALL +#include + +// CATCH_REQUIRE_THROWS is not defined identically to REQUIRE_THROWS and causes warning; +// define our own version that doesn't warn. +#define _CATCH_REQUIRE_THROWS( ... 
) INTERNAL_CATCH_THROWS( "CATCH_REQUIRE_THROWS", Catch::ResultDisposition::Normal, __VA_ARGS__ ) diff --git a/aten/src/ATen/test/cuda_half_test.cu b/aten/src/ATen/test/cuda_half_test.cu index fa00e534ee07ef..cce267100589e1 100644 --- a/aten/src/ATen/test/cuda_half_test.cu +++ b/aten/src/ATen/test/cuda_half_test.cu @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/cuda/NumericLimits.cuh" @@ -82,9 +82,9 @@ void launch_function(){ kernel<<<1,1>>>(); } -TEST_CASE( "half common math functions tests in device", "[cuda]" ) { +CATCH_TEST_CASE( "half common math functions tests in device", "[cuda]" ) { launch_function(); cudaError_t err = cudaDeviceSynchronize(); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); } diff --git a/aten/src/ATen/test/cuda_optional_test.cu b/aten/src/ATen/test/cuda_optional_test.cu index 9956dcf52b04ef..b64c530b355914 100644 --- a/aten/src/ATen/test/cuda_optional_test.cu +++ b/aten/src/ATen/test/cuda_optional_test.cu @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/optional.h" @@ -8,15 +8,15 @@ using namespace at; -TEST_CASE( "optional in cuda files", "[cuda]" ) { +CATCH_TEST_CASE( "optional in cuda files", "[cuda]" ) { at::optional trivially_destructible; at::optional> non_trivially_destructible; - REQUIRE(!trivially_destructible.has_value()); - REQUIRE(!non_trivially_destructible.has_value()); + CATCH_REQUIRE(!trivially_destructible.has_value()); + CATCH_REQUIRE(!non_trivially_destructible.has_value()); trivially_destructible = {5}; non_trivially_destructible = std::vector{5, 10}; - REQUIRE(trivially_destructible.has_value()); - REQUIRE(non_trivially_destructible.has_value()); + CATCH_REQUIRE(trivially_destructible.has_value()); + CATCH_REQUIRE(non_trivially_destructible.has_value()); } diff --git a/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu b/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu new file mode 100644 index 00000000000000..a529f38d748a1b --- /dev/null +++ b/aten/src/ATen/test/cuda_packedtensoraccessor_test.cu @@ -0,0 +1,46 @@ +#define CATCH_CONFIG_MAIN +#include "catch_utils.hpp" + +#include "ATen/ATen.h" +#include "test_seed.h" +#include "ATen/core/TensorAccessor.h" +#include "ATen/cuda/CUDAContext.h" + +#include + +using namespace at; + +__global__ void test_tensor_packed_accessor_kernel(PackedTensorAccessor resa, + PackedTensorAccessor t1a, + PackedTensorAccessor t2a){ + for (int64_t i = 0; i < resa.size(0); i++) { + float val = 0.0f; + for (int64_t j = 0; j < t1a.size(1); j++) { + val += t1a[i][j] * t2a[j]; + } + resa[i] = val; + } +} + +CATCH_TEST_CASE( "test PackedTensorAccessor and Tensor.packed_accessor", "[cuda]" ) { + manual_seed(123, at::kCPU); + manual_seed(123, at::kCUDA); + + Tensor t1 = rand({4, 4}, CUDA(kFloat)); + Tensor t2 = rand({4}, CUDA(kFloat)); + Tensor res = empty({4}, CUDA(kFloat)); + + auto t1a = t1.packed_accessor(); + auto t2a = t2.packed_accessor(); + auto resa = res.packed_accessor(); + + auto stream = at::cuda::getCurrentCUDAStream(); + + test_tensor_packed_accessor_kernel<<<1, 1, 0, stream>>>(resa, t1a, t2a); + cudaError_t err = cudaDeviceSynchronize(); + CATCH_REQUIRE(err == cudaSuccess); + + auto expected = mv(t1, t2); + + CATCH_REQUIRE(res.allclose(expected)); +} diff --git a/aten/src/ATen/test/cuda_rng_test.cpp b/aten/src/ATen/test/cuda_rng_test.cpp index d32903dd2fe1f3..7b14174d3baeb3 100644 --- 
a/aten/src/ATen/test/cuda_rng_test.cpp +++ b/aten/src/ATen/test/cuda_rng_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "cuda.h" @@ -21,7 +21,7 @@ void testCudaRNGMultithread() { } }; -TEST_CASE( "CUDA RNG test", "[cuda]" ) { - SECTION( "multithread" ) +CATCH_TEST_CASE( "CUDA RNG test", "[cuda]" ) { + CATCH_SECTION( "multithread" ) testCudaRNGMultithread(); } diff --git a/aten/src/ATen/test/cudnn_test.cpp b/aten/src/ATen/test/cudnn_test.cpp index 31786e88a0944d..4391867d166772 100644 --- a/aten/src/ATen/test/cudnn_test.cpp +++ b/aten/src/ATen/test/cudnn_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/cudnn/Descriptors.h" @@ -9,7 +9,7 @@ using namespace at; using namespace at::native; -TEST_CASE( "cudnn", "[cuda]" ) { +CATCH_TEST_CASE( "cudnn", "[cuda]" ) { manual_seed(123, at::kCUDA); #if CUDNN_VERSION < 7000 @@ -18,8 +18,8 @@ TEST_CASE( "cudnn", "[cuda]" ) { desc1.initialize_rng(at::CUDA(kByte), handle, 0.5, 42); desc2.set(handle, 0.5, desc1.state); - REQUIRE(desc1.desc()->dropout == desc2.desc()->dropout); - REQUIRE(desc1.desc()->nstates == desc2.desc()->nstates); - REQUIRE(desc1.desc()->states == desc2.desc()->states); + CATCH_REQUIRE(desc1.desc()->dropout == desc2.desc()->dropout); + CATCH_REQUIRE(desc1.desc()->nstates == desc2.desc()->nstates); + CATCH_REQUIRE(desc1.desc()->states == desc2.desc()->states); #endif } diff --git a/aten/src/ATen/test/dlconvertor_test.cpp b/aten/src/ATen/test/dlconvertor_test.cpp index 48829298760276..bf0cf93f7c4064 100644 --- a/aten/src/ATen/test/dlconvertor_test.cpp +++ b/aten/src/ATen/test/dlconvertor_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/DLConvertor.h" @@ -11,17 +11,17 @@ using namespace at; -TEST_CASE( "dlconvertor", "[cpu]" ) { +CATCH_TEST_CASE( "dlconvertor", "[cpu]" ) { manual_seed(123, at::kCPU); - INFO( "convert ATen to DLTensor" ); + CATCH_INFO( "convert ATen to DLTensor" ); Tensor a = rand({3,4}); DLManagedTensor* dlMTensor = toDLPack(a); - INFO( "convert DLTensor to ATen" ); + CATCH_INFO( "convert DLTensor to ATen" ); Tensor b = fromDLPack(dlMTensor); - REQUIRE(a.equal(b)); + CATCH_REQUIRE(a.equal(b)); } diff --git a/aten/src/ATen/test/half_test.cpp b/aten/src/ATen/test/half_test.cpp index 3b2944803e6b5a..32177705a2f883 100644 --- a/aten/src/ATen/test/half_test.cpp +++ b/aten/src/ATen/test/half_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include #include @@ -12,53 +12,53 @@ using namespace at; -TEST_CASE( "half arithmetic", "[]" ) { +CATCH_TEST_CASE( "half arithmetic", "[]" ) { Half zero = 0; Half one = 1; - REQUIRE(zero + one == one); - REQUIRE(zero + zero == zero); - REQUIRE(zero * one == zero); - REQUIRE(one * one == one); - REQUIRE(one / one == one); - REQUIRE(one - one == zero); - REQUIRE(one - zero == one); - REQUIRE(zero - one == -one); - REQUIRE(one + one == Half(2)); - REQUIRE(one + one == 2); + CATCH_REQUIRE(zero + one == one); + CATCH_REQUIRE(zero + zero == zero); + CATCH_REQUIRE(zero * one == zero); + CATCH_REQUIRE(one * one == one); + CATCH_REQUIRE(one / one == one); + CATCH_REQUIRE(one - one == zero); + CATCH_REQUIRE(one - zero == one); + CATCH_REQUIRE(zero - one == -one); + CATCH_REQUIRE(one + one == Half(2)); + CATCH_REQUIRE(one + one == 2); } -TEST_CASE( "half comparisons", "[]" ) { 
+CATCH_TEST_CASE( "half comparisons", "[]" ) { Half zero = 0; Half one = 1; - REQUIRE(zero < one); - REQUIRE(zero < 1); - REQUIRE(1 > zero); - REQUIRE(0 >= zero); - REQUIRE(0 != one); - REQUIRE(zero == 0); - REQUIRE(zero == zero); - REQUIRE(zero == -zero); + CATCH_REQUIRE(zero < one); + CATCH_REQUIRE(zero < 1); + CATCH_REQUIRE(1 > zero); + CATCH_REQUIRE(0 >= zero); + CATCH_REQUIRE(0 != one); + CATCH_REQUIRE(zero == 0); + CATCH_REQUIRE(zero == zero); + CATCH_REQUIRE(zero == -zero); } -TEST_CASE( "half cast", "[]" ) { +CATCH_TEST_CASE( "half cast", "[]" ) { Half value = 1.5f; - REQUIRE((int)value == 1); - REQUIRE((short)value == 1); - REQUIRE((long long)value == 1LL); - REQUIRE((float)value == 1.5f); - REQUIRE((double)value == 1.5); - REQUIRE((bool)value == true); - REQUIRE((bool)Half(0.0f) == false); + CATCH_REQUIRE((int)value == 1); + CATCH_REQUIRE((short)value == 1); + CATCH_REQUIRE((long long)value == 1LL); + CATCH_REQUIRE((float)value == 1.5f); + CATCH_REQUIRE((double)value == 1.5); + CATCH_REQUIRE((bool)value == true); + CATCH_REQUIRE((bool)Half(0.0f) == false); } -TEST_CASE( "half construction", "[]" ) { - REQUIRE(Half((short)3) == Half(3.0f)); - REQUIRE(Half((unsigned short)3) == Half(3.0f)); - REQUIRE(Half(3) == Half(3.0f)); - REQUIRE(Half(3U) == Half(3.0f)); - REQUIRE(Half(3LL) == Half(3.0f)); - REQUIRE(Half(3ULL) == Half(3.0f)); - REQUIRE(Half(3.5) == Half(3.5f)); +CATCH_TEST_CASE( "half construction", "[]" ) { + CATCH_REQUIRE(Half((short)3) == Half(3.0f)); + CATCH_REQUIRE(Half((unsigned short)3) == Half(3.0f)); + CATCH_REQUIRE(Half(3) == Half(3.0f)); + CATCH_REQUIRE(Half(3U) == Half(3.0f)); + CATCH_REQUIRE(Half(3LL) == Half(3.0f)); + CATCH_REQUIRE(Half(3ULL) == Half(3.0f)); + CATCH_REQUIRE(Half(3.5) == Half(3.5f)); } static std::string to_string(const Half& h) { @@ -67,22 +67,22 @@ static std::string to_string(const Half& h) { return ss.str(); } -TEST_CASE( "half to string", "[]" ) { - REQUIRE(to_string(Half(3.5f)) == "3.5"); - REQUIRE(to_string(Half(-100.0f)) == "-100"); +CATCH_TEST_CASE( "half to string", "[]" ) { + CATCH_REQUIRE(to_string(Half(3.5f)) == "3.5"); + CATCH_REQUIRE(to_string(Half(-100.0f)) == "-100"); } -TEST_CASE( "half numeric limits", "[]" ) { +CATCH_TEST_CASE( "half numeric limits", "[]" ) { using limits = std::numeric_limits; - REQUIRE(limits::lowest() == -65504.0f); - REQUIRE(limits::max() == 65504.0f); - REQUIRE(limits::min() > 0); - REQUIRE(limits::min() < 1); - REQUIRE(limits::denorm_min() > 0); - REQUIRE(limits::denorm_min() / 2 == 0); - REQUIRE(limits::infinity() == std::numeric_limits::infinity()); - REQUIRE(limits::quiet_NaN() != limits::quiet_NaN()); - REQUIRE(limits::signaling_NaN() != limits::signaling_NaN()); + CATCH_REQUIRE(limits::lowest() == -65504.0f); + CATCH_REQUIRE(limits::max() == 65504.0f); + CATCH_REQUIRE(limits::min() > 0); + CATCH_REQUIRE(limits::min() < 1); + CATCH_REQUIRE(limits::denorm_min() > 0); + CATCH_REQUIRE(limits::denorm_min() / 2 == 0); + CATCH_REQUIRE(limits::infinity() == std::numeric_limits::infinity()); + CATCH_REQUIRE(limits::quiet_NaN() != limits::quiet_NaN()); + CATCH_REQUIRE(limits::signaling_NaN() != limits::signaling_NaN()); } // Check the declared type of members of numeric_limits matches @@ -119,7 +119,7 @@ ASSERT_SAME_TYPE(max_exponent10); ASSERT_SAME_TYPE(traps); ASSERT_SAME_TYPE(tinyness_before); -TEST_CASE( "half common math functions test", "[]" ) { +CATCH_TEST_CASE( "half common math functions test", "[]" ) { float threshold = 0.00001; assert(std::abs(std::lgamma(Half(10.0)) - std::lgamma(10.0f)) <= 
threshold); assert(std::abs(std::exp(Half(1.0)) - std::exp(1.0f)) <= threshold); diff --git a/aten/src/ATen/test/integer_divider_test.cu b/aten/src/ATen/test/integer_divider_test.cu index 4c63ab3a8fd205..d09a423d7ca72d 100644 --- a/aten/src/ATen/test/integer_divider_test.cu +++ b/aten/src/ATen/test/integer_divider_test.cu @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" // Test IntegerDivider: this tests *all* 32-bit pairs (a, b) where a % b is 0 or // (b-1), so it takes a few minutes to run. @@ -62,18 +62,18 @@ class IntDividerTester { cudaError_t err; err = cudaMalloc(÷rsBuf_, NUM_CASES * sizeof(IntDivider)); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); err = cudaMalloc(&testCasesBuf_, NUM_CASES * sizeof(TestCase)); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); } ~IntDividerTester() { cudaError_t err; err = cudaFree(dividersBuf_); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); err = cudaFree(testCasesBuf_); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); } void addTestCase(Value dividend, Value divisor, int steps) { @@ -92,18 +92,18 @@ class IntDividerTester { cudaError_t err; if (testCases_.empty()) return; - REQUIRE(!dividers_.empty()); + CATCH_REQUIRE(!dividers_.empty()); - REQUIRE(dividers_.size() <= NUM_CASES); - REQUIRE(testCases_.size() <= NUM_CASES); + CATCH_REQUIRE(dividers_.size() <= NUM_CASES); + CATCH_REQUIRE(testCases_.size() <= NUM_CASES); err = cudaMemcpy(dividersBuf_, dividers_.data(), dividers_.size() * sizeof(IntDivider), cudaMemcpyHostToDevice); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); err = cudaMemcpy(testCasesBuf_, testCases_.data(), testCases_.size() * sizeof(TestCase), cudaMemcpyHostToDevice); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); int numCases = testCases_.size(); testIntDivider<<<512, 512>>>( @@ -180,11 +180,11 @@ static void testUint64Divider() tester.flush(); } -TEST_CASE( "CUDA integer divider", "[cuda]" ) { +CATCH_TEST_CASE( "CUDA integer divider", "[cuda]" ) { testUint64Divider(); testUint32Divider(); cudaError_t err = cudaDeviceSynchronize(); - REQUIRE(err == cudaSuccess); + CATCH_REQUIRE(err == cudaSuccess); } diff --git a/aten/src/ATen/test/native_test.cpp b/aten/src/ATen/test/native_test.cpp index e10de30ae8e023..4c57b7d8ee1d96 100644 --- a/aten/src/ATen/test/native_test.cpp +++ b/aten/src/ATen/test/native_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "test_seed.h" @@ -9,18 +9,18 @@ using namespace at; using Catch::Matchers::StartsWith; #define REQUIRE_EQUAL(t1, t2) \ - REQUIRE(t1.equal(t2)); + CATCH_REQUIRE(t1.equal(t2)); #define REQUIRE_ALLCLOSE(t1, t2) \ - REQUIRE(t1.is_same_size(t2)); \ - REQUIRE(t1.allclose(t2)); + CATCH_REQUIRE(t1.is_same_size(t2)); \ + CATCH_REQUIRE(t1.allclose(t2)); #define REQUIRE_ALLCLOSE_TOLERANCES(t1, t2, atol, rtol) \ - REQUIRE(t1.is_same_size(t2)); \ - REQUIRE(t1.allclose(t2, atol, rtol)); + CATCH_REQUIRE(t1.is_same_size(t2)); \ + CATCH_REQUIRE(t1.allclose(t2, atol, rtol)); void requireEqualTensorList(TensorList t1, TensorList t2) { - REQUIRE(t1.size() == t2.size()); + CATCH_REQUIRE(t1.size() == t2.size()); for (size_t i = 0; i < t1.size(); ++i) { REQUIRE_EQUAL(t1[ i ], t2[ i ]); } @@ -29,7 +29,7 @@ void requireEqualTensorList(TensorList t1, TensorList t2) { void test(Type & T, Type & AccT) { auto t = randn({3, 3}, T); - SECTION( "split: test method, type, 
namespace give same result" ) { + CATCH_SECTION( "split: test method, type, namespace give same result" ) { auto splitMethod = t.split(1, 0); auto splitType = T.split(t, 1, 0); auto splitNs = at::split(t, 1, 0); @@ -40,7 +40,7 @@ void test(Type & T, Type & AccT) { REQUIRE_EQUAL(at::cat(splitMethod, 0), t); } - SECTION( "chunk: test method, type, namespace give same result" ) { + CATCH_SECTION( "chunk: test method, type, namespace give same result" ) { // test method, type, namespace give same result auto chunkMethod = t.chunk(3, 0); auto chunkType = T.chunk(t, 3, 0); @@ -53,7 +53,7 @@ void test(Type & T, Type & AccT) { } // stack - SECTION( "stack" ) { + CATCH_SECTION( "stack" ) { auto x = rand({2, 3, 4}); auto y = rand({2, 3, 4}); auto z = rand({2, 3, 4}); @@ -66,36 +66,36 @@ void test(Type & T, Type & AccT) { expected_size.insert(expected_size.end(), x.sizes().begin() + dim, x.sizes().end()); REQUIRE_EQUAL(res, res_neg); - REQUIRE(res.sizes().equals(expected_size)); + CATCH_REQUIRE(res.sizes().equals(expected_size)); REQUIRE_EQUAL(res.select(dim, 0), x); REQUIRE_EQUAL(res.select(dim, 1), y); REQUIRE_EQUAL(res.select(dim, 2), z); } } - SECTION( "size / stride" ) { + CATCH_SECTION( "size / stride" ) { auto scalar = randn({}, T); - REQUIRE_THROWS_WITH(scalar.size(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); - REQUIRE_THROWS_WITH(scalar.size(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); - REQUIRE_THROWS_WITH(scalar.stride(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); - REQUIRE_THROWS_WITH(scalar.stride(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); + CATCH_REQUIRE_THROWS_WITH(scalar.size(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); + CATCH_REQUIRE_THROWS_WITH(scalar.size(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); + CATCH_REQUIRE_THROWS_WITH(scalar.stride(0), StartsWith("dimension specified as 0 but tensor has no dimensions")); + CATCH_REQUIRE_THROWS_WITH(scalar.stride(-1), StartsWith("dimension specified as -1 but tensor has no dimensions")); auto empty = randn({0}, T); - REQUIRE(empty.size(0) == 0); - REQUIRE(empty.size(-1) == 0); - REQUIRE(empty.stride(0) == 1); - REQUIRE(empty.stride(-1) == 1); + CATCH_REQUIRE(empty.size(0) == 0); + CATCH_REQUIRE(empty.size(-1) == 0); + CATCH_REQUIRE(empty.stride(0) == 1); + CATCH_REQUIRE(empty.stride(-1) == 1); } // matmul - SECTION( "matmul" ) { + CATCH_SECTION( "matmul" ) { auto scalar = randn({}, T); auto d1 = randn({3}, T); auto d2 = randn({2, 3}, T); // 0-d - REQUIRE_THROWS_WITH(scalar.matmul(d2), Catch::StartsWith("both arguments to matmul need to be at least 1D")); - REQUIRE_THROWS_WITH(d2.matmul(scalar), Catch::StartsWith("both arguments to matmul need to be at least 1D")); + CATCH_REQUIRE_THROWS_WITH(scalar.matmul(d2), Catch::StartsWith("both arguments to matmul need to be at least 1D")); + CATCH_REQUIRE_THROWS_WITH(d2.matmul(scalar), Catch::StartsWith("both arguments to matmul need to be at least 1D")); // 1-d REQUIRE_ALLCLOSE(d1.matmul(d1), d1.dot(d1)); @@ -140,11 +140,11 @@ void test(Type & T, Type & AccT) { // non-expandable case auto d5wrong = randn({2, 4, 2, 4, 3, 2}, T); - REQUIRE_THROWS_WITH(d5.matmul(d5wrong), Catch::Contains("must match the size")); + CATCH_REQUIRE_THROWS_WITH(d5.matmul(d5wrong), Catch::Contains("must match the size")); } // _standard_gamma_grad - SECTION( "_standard_gamma_grad" ) { + CATCH_SECTION( "_standard_gamma_grad" ) { // check empty 
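
Illustrative sketch only, not part of the patch: every test file touched above follows the same pattern once it switches from catch.hpp to catch_utils.hpp, i.e. it uses the CATCH_-prefixed Catch macros, with the underscore-prefixed _CATCH_REQUIRE_THROWS standing in for REQUIRE_THROWS. The test name and tensor shapes below are made up for illustration; the macros are the ones introduced by this diff.

#define CATCH_CONFIG_MAIN
#include "catch_utils.hpp"
#include "ATen/ATen.h"

CATCH_TEST_CASE( "prefixed macro usage (illustration)", "[cpu]" ) {
  auto t = at::ones({2, 3});
  CATCH_SECTION( "plain assertion" ) {
    CATCH_REQUIRE(t.size(0) == 2);
  }
  CATCH_SECTION( "expected throw" ) {
    // wrapper defined in catch_utils.hpp; avoids the warning that the
    // stock CATCH_REQUIRE_THROWS produces under CATCH_CONFIG_PREFIX_ALL
    _CATCH_REQUIRE_THROWS(t.expand({7}));
  }
}
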
auto empty = ones({0}, T); REQUIRE_EQUAL(empty, at::_standard_gamma_grad(empty, empty)); @@ -158,10 +158,10 @@ void test(Type & T, Type & AccT) { // check mixing types auto t1 = randn({3, 4}, T); auto t2 = randn({3, 4}, T).toType(kDouble); - REQUIRE_THROWS_WITH(at::_standard_gamma_grad(t1, t2), Catch::StartsWith("expected scalar type")); + CATCH_REQUIRE_THROWS_WITH(at::_standard_gamma_grad(t1, t2), Catch::StartsWith("expected scalar type")); } - SECTION( "where" ) { + CATCH_SECTION( "where" ) { // empty auto empty = ones({0}, T); auto &bT = T.toScalarType(ScalarType::Byte); @@ -180,13 +180,13 @@ void test(Type & T, Type & AccT) { } } -TEST_CASE( "native test CPU", "[cpu]" ) { +CATCH_TEST_CASE( "native test CPU", "[cpu]" ) { manual_seed(123, at::kCPU); test(CPU(kFloat), CPU(kDouble)); } -TEST_CASE( "native test CUDA", "[cuda]" ) { +CATCH_TEST_CASE( "native test CUDA", "[cuda]" ) { manual_seed(123, at::kCUDA); if (at::hasCUDA()) { diff --git a/aten/src/ATen/test/scalar_tensor_test.cpp b/aten/src/ATen/test/scalar_tensor_test.cpp index d52dc27e20295e..964f6260e7d9ff 100644 --- a/aten/src/ATen/test/scalar_tensor_test.cpp +++ b/aten/src/ATen/test/scalar_tensor_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "test_seed.h" @@ -18,14 +18,14 @@ using namespace at; _passed = true; \ els; \ } catch (std::exception &e) { \ - REQUIRE(!_passed); \ + CATCH_REQUIRE(!_passed); \ catc; \ } \ } void require_equal_size_dim(const Tensor &lhs, const Tensor &rhs) { - REQUIRE(lhs.dim() == rhs.dim()); - REQUIRE(lhs.sizes().equals(rhs.sizes())); + CATCH_REQUIRE(lhs.dim() == rhs.dim()); + CATCH_REQUIRE(lhs.sizes().equals(rhs.sizes())); } bool should_expand(const IntList &from_size, const IntList &to_size) { @@ -49,15 +49,15 @@ void test(Type &T) { for (auto s = sizes.begin(); s != sizes.end(); ++s) { // verify that the dim, sizes, strides, etc match what was requested. auto t = ones(*s, T); - REQUIRE((size_t)t.dim() == s->size()); - REQUIRE((size_t)t.ndimension() == s->size()); - REQUIRE(t.sizes().equals(*s)); - REQUIRE(t.strides().size() == s->size()); + CATCH_REQUIRE((size_t)t.dim() == s->size()); + CATCH_REQUIRE((size_t)t.ndimension() == s->size()); + CATCH_REQUIRE(t.sizes().equals(*s)); + CATCH_REQUIRE(t.strides().size() == s->size()); auto numel = std::accumulate(s->begin(), s->end(), 1, std::multiplies()); - REQUIRE(t.numel() == numel); + CATCH_REQUIRE(t.numel() == numel); // verify we can output std::stringstream ss; - REQUIRE_NOTHROW(ss << t << std::endl); + CATCH_REQUIRE_NOTHROW(ss << t << std::endl); // set_ auto t2 = ones(*s, T); @@ -65,22 +65,22 @@ void test(Type &T) { require_equal_size_dim(t2, ones({0}, T)); // unsqueeze - REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); + CATCH_REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); // unsqueeze_ { auto t2 = ones(*s, T); auto r = t2.unsqueeze_(0); - REQUIRE(r.dim() == t.dim() + 1); + CATCH_REQUIRE(r.dim() == t.dim() + 1); } // squeeze (with dimension argument) if (t.dim() == 0 || t.sizes()[0] == 1) { - REQUIRE(t.squeeze(0).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(t.squeeze(0).dim() == std::max(t.dim() - 1, 0)); } else { // In PyTorch, it is a no-op to try to squeeze a dimension that has size != 1; // in NumPy this is an error. 
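
As an aside to the squeeze comments in the hunk above, the PyTorch-versus-NumPy behaviour they describe can be summarised in a short sketch (illustrative only, not part of the patch; shapes are arbitrary):

#include "ATen/ATen.h"

void squeeze_semantics_sketch() {
  auto t = at::ones({3, 4});
  AT_ASSERT(t.squeeze(0).dim() == t.dim());  // dim 0 has size 3: squeeze is a no-op in PyTorch
  auto u = at::ones({1, 4});
  AT_ASSERT(u.squeeze(0).dim() == 1);        // dim 0 has size 1: the dimension is removed
}
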
- REQUIRE(t.squeeze(0).dim() == t.dim()); + CATCH_REQUIRE(t.squeeze(0).dim() == t.dim()); } // squeeze (with no dimension argument) @@ -99,11 +99,11 @@ void test(Type &T) { // squeeze_ (with dimension argument) auto t2 = ones(*s, T); if (t2.dim() == 0 || t2.sizes()[0] == 1) { - REQUIRE(t2.squeeze_(0).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(t2.squeeze_(0).dim() == std::max(t.dim() - 1, 0)); } else { // In PyTorch, it is a no-op to try to squeeze a dimension that has size != 1; // in NumPy this is an error. - REQUIRE(t2.squeeze_(0).dim() == t.dim()); + CATCH_REQUIRE(t2.squeeze_(0).dim() == t.dim()); } } @@ -122,31 +122,31 @@ void test(Type &T) { // reduce (with dimension argument and with 1 return argument) if (t.numel() != 0) { - REQUIRE(t.sum(0).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(t.sum(0).dim() == std::max(t.dim() - 1, 0)); } else { - REQUIRE(t.sum(0).equal(at::zeros({}, T))); + CATCH_REQUIRE(t.sum(0).equal(at::zeros({}, T))); } // reduce (with dimension argument and with 2 return arguments) if (t.numel() != 0) { auto ret = t.min(0); - REQUIRE(std::get<0>(ret).dim() == std::max(t.dim() - 1, 0)); - REQUIRE(std::get<1>(ret).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(std::get<0>(ret).dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(std::get<1>(ret).dim() == std::max(t.dim() - 1, 0)); } else { - REQUIRE_THROWS(t.min(0)); + _CATCH_REQUIRE_THROWS(t.min(0)); } // simple indexing if (t.dim() > 0 && t.numel() != 0) { - REQUIRE(t[0].dim() == std::max(t.dim() - 1, 0)); + CATCH_REQUIRE(t[0].dim() == std::max(t.dim() - 1, 0)); } else { - REQUIRE_THROWS(t[0]); + _CATCH_REQUIRE_THROWS(t[0]); } // fill_ (argument to fill_ can only be a 0-dim tensor) TRY_CATCH_ELSE(t.fill_(t.sum(0)), - REQUIRE(t.dim() > 1), - REQUIRE(t.dim() <= 1)); + CATCH_REQUIRE(t.dim() > 1), + CATCH_REQUIRE(t.dim() <= 1)); } for (auto lhs_it = sizes.begin(); lhs_it != sizes.end(); ++lhs_it) { @@ -156,8 +156,8 @@ void test(Type &T) { auto lhs = ones(*lhs_it, T); auto rhs = ones(*rhs_it, T); if(*lhs_it != *rhs_it) { - REQUIRE(!lhs.is_same_size(rhs)); - REQUIRE(!rhs.is_same_size(lhs)); + CATCH_REQUIRE(!lhs.is_same_size(rhs)); + CATCH_REQUIRE(!rhs.is_same_size(lhs)); } } // forced size functions (resize_, resize_as, set_) @@ -192,7 +192,7 @@ void test(Type &T) { auto storage = T.storage(rhs.numel(), false); lhs.set_(storage); // should not be dim 0 because an empty storage is dim 1; all other storages aren't scalars - REQUIRE(lhs.dim() != 0); + CATCH_REQUIRE(lhs.dim() != 0); } { // with storage, offset, sizes, strides @@ -211,8 +211,8 @@ void test(Type &T) { auto rhs = ones(*rhs_it, T); auto rhs_size = *rhs_it; TRY_CATCH_ELSE(auto result = lhs.view(rhs_size), - REQUIRE(lhs.numel() != rhs.numel()), - REQUIRE(lhs.numel() == rhs.numel()); require_equal_size_dim(result, rhs);); + CATCH_REQUIRE(lhs.numel() != rhs.numel()), + CATCH_REQUIRE(lhs.numel() == rhs.numel()); require_equal_size_dim(result, rhs);); } // take @@ -220,7 +220,7 @@ void test(Type &T) { auto lhs = ones(*lhs_it, T); auto rhs = zeros(*rhs_it, T).toType(ScalarType::Long); TRY_CATCH_ELSE(auto result = lhs.take(rhs), - REQUIRE(lhs.numel() == 0); REQUIRE(rhs.numel() != 0), + CATCH_REQUIRE(lhs.numel() == 0); CATCH_REQUIRE(rhs.numel() != 0), require_equal_size_dim(result, rhs)); } @@ -230,7 +230,7 @@ void test(Type &T) { auto lhs = ones(*lhs_it, T); auto rhs = ones(*rhs_it, T); TRY_CATCH_ELSE(auto result = lhs.ger(rhs), - REQUIRE((lhs.numel() == 0 || rhs.numel() == 0 || lhs.dim() != 1 || rhs.dim() != 1)), + CATCH_REQUIRE((lhs.numel() == 0 || 
rhs.numel() == 0 || lhs.dim() != 1 || rhs.dim() != 1)), [&]() { int64_t dim0 = lhs.dim() == 0 ? 1 : lhs.size(0); int64_t dim1 = rhs.dim() == 0 ? 1 : rhs.size(0); @@ -246,8 +246,8 @@ void test(Type &T) { auto rhs_size = *rhs_it; bool should_pass = should_expand(lhs_size, rhs_size); TRY_CATCH_ELSE(auto result = lhs.expand(rhs_size), - REQUIRE(!should_pass), - REQUIRE(should_pass); require_equal_size_dim(result, rhs);); + CATCH_REQUIRE(!should_pass), + CATCH_REQUIRE(should_pass); require_equal_size_dim(result, rhs);); // in-place functions (would be good if we can also do a non-broadcasting one, b/c // broadcasting functions will always end up operating on tensors of same size; @@ -255,21 +255,21 @@ void test(Type &T) { { bool should_pass_inplace = should_expand(rhs_size, lhs_size); TRY_CATCH_ELSE(lhs.add_(rhs), - REQUIRE(!should_pass_inplace), - REQUIRE(should_pass_inplace); require_equal_size_dim(lhs, ones(*lhs_it, T));); + CATCH_REQUIRE(!should_pass_inplace), + CATCH_REQUIRE(should_pass_inplace); require_equal_size_dim(lhs, ones(*lhs_it, T));); } } } } } -TEST_CASE( "scalar tensor test CPU", "[cpu]" ) { +CATCH_TEST_CASE( "scalar tensor test CPU", "[cpu]" ) { manual_seed(123, at::kCPU); test(CPU(kFloat)); } -TEST_CASE( "scalar tensor test CUDA", "[cuda]" ) { +CATCH_TEST_CASE( "scalar tensor test CUDA", "[cuda]" ) { manual_seed(123, at::kCUDA); if (at::hasCUDA()) { diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index 72ef4e4ad3cf4c..247830c3cc839c 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include // define constants like M_PI and C keywords for MSVC @@ -33,25 +33,25 @@ struct Foo { void test_overflow() { auto s1 = Scalar(M_PI); - REQUIRE(s1.toFloat() == static_cast(M_PI)); + CATCH_REQUIRE(s1.toFloat() == static_cast(M_PI)); s1.toHalf(); s1 = Scalar(100000); - REQUIRE(s1.toFloat() == 100000.0); - REQUIRE(s1.toInt() == 100000); + CATCH_REQUIRE(s1.toFloat() == 100000.0); + CATCH_REQUIRE(s1.toInt() == 100000); - REQUIRE_THROWS_AS(s1.toHalf(), std::domain_error); + CATCH_REQUIRE_THROWS_AS(s1.toHalf(), std::domain_error); s1 = Scalar(NAN); - REQUIRE(std::isnan(s1.toFloat())); - REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); + CATCH_REQUIRE(std::isnan(s1.toFloat())); + CATCH_REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); s1 = Scalar(INFINITY); - REQUIRE(std::isinf(s1.toFloat())); - REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); + CATCH_REQUIRE(std::isinf(s1.toFloat())); + CATCH_REQUIRE_THROWS_AS(s1.toInt(), std::domain_error); } -TEST_CASE( "scalar test", "[]" ) { +CATCH_TEST_CASE( "scalar test", "[]" ) { manual_seed(123, at::kCPU); manual_seed(123, at::kCUDA); @@ -62,7 +62,7 @@ TEST_CASE( "scalar test", "[]" ) { Scalar h2 = h; cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " " << bar.toDouble() << " " << what.isIntegral() << "\n"; Generator & gen = at::globalContext().defaultGenerator(at::kCPU); - REQUIRE_NOTHROW(gen.seed()); + CATCH_REQUIRE_NOTHROW(gen.seed()); auto && C = at::globalContext(); if(at::hasCUDA()) { auto t2 = zeros({4,4}, at::kCUDA); @@ -71,12 +71,12 @@ TEST_CASE( "scalar test", "[]" ) { auto t = ones({4,4}); auto wha2 = zeros({4,4}).add(t).sum(); - REQUIRE( wha2.toCDouble() == 16.0 ); + CATCH_REQUIRE( wha2.toCDouble() == 16.0 ); - REQUIRE( t.sizes()[0] == 4 ); - REQUIRE( t.sizes()[1] == 4 ); - REQUIRE( t.strides()[0] == 4 ); - REQUIRE( t.strides()[1] == 1 ); + CATCH_REQUIRE( 
t.sizes()[0] == 4 ); + CATCH_REQUIRE( t.sizes()[1] == 4 ); + CATCH_REQUIRE( t.strides()[0] == 4 ); + CATCH_REQUIRE( t.strides()[1] == 1 ); Type & T = CPU(Float); Tensor x = randn({1,10}, T); @@ -88,26 +88,26 @@ TEST_CASE( "scalar test", "[]" ) { Tensor next_h = i2h.add(h2h); next_h = next_h.tanh(); - REQUIRE_THROWS(at::_local_scalar(Tensor{})); + _CATCH_REQUIRE_THROWS(at::_local_scalar(Tensor{})); test_overflow(); if(at::hasCUDA()) { auto r = CUDA(Float).copy(next_h); - REQUIRE(CPU(Float).copy(r).equal(next_h)); + CATCH_REQUIRE(CPU(Float).copy(r).equal(next_h)); } - REQUIRE_NOTHROW(randn({10,10,2}, T)); + CATCH_REQUIRE_NOTHROW(randn({10,10,2}, T)); // check Scalar.toTensor on Scalars backed by different data types - REQUIRE(scalar_to_tensor(bar).type().scalarType() == kDouble); - REQUIRE(scalar_to_tensor(what).type().scalarType() == kLong); - REQUIRE(scalar_to_tensor(ones({})._local_scalar()).type().scalarType() == kDouble); + CATCH_REQUIRE(scalar_to_tensor(bar).type().scalarType() == kDouble); + CATCH_REQUIRE(scalar_to_tensor(what).type().scalarType() == kLong); + CATCH_REQUIRE(scalar_to_tensor(ones({})._local_scalar()).type().scalarType() == kDouble); if (x.type().scalarType() != ScalarType::Half) { AT_DISPATCH_ALL_TYPES(x.type(), "foo", [&] { scalar_t s = 1; std::stringstream ss; - REQUIRE_NOTHROW(ss << "hello, dispatch" << x.type().toString() << s << "\n"); + CATCH_REQUIRE_NOTHROW(ss << "hello, dispatch" << x.type().toString() << s << "\n"); auto data = (scalar_t*)x.data_ptr(); (void)data; }); @@ -116,10 +116,10 @@ TEST_CASE( "scalar test", "[]" ) { // test direct C-scalar type conversions { auto x = ones({1,2}, T); - REQUIRE_THROWS(x.toCFloat()); + _CATCH_REQUIRE_THROWS(x.toCFloat()); } auto float_one = ones({}, T); - REQUIRE(float_one.toCFloat() == 1); - REQUIRE(float_one.toCInt() == 1); - REQUIRE((float_one.toCHalf() == 1)); + CATCH_REQUIRE(float_one.toCFloat() == 1); + CATCH_REQUIRE(float_one.toCInt() == 1); + CATCH_REQUIRE((float_one.toCHalf() == 1)); } diff --git a/aten/src/ATen/test/stream_test.cpp b/aten/src/ATen/test/stream_test.cpp index 145c4f4c261276..8dc015dd1d06ae 100644 --- a/aten/src/ATen/test/stream_test.cpp +++ b/aten/src/ATen/test/stream_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/cuda/CUDAContext.h" #include "ATen/cuda/CUDAGuard.h" @@ -14,7 +14,7 @@ /* Tests related to ATen streams. 
*/ -TEST_CASE( +CATCH_TEST_CASE( "Copying and Moving Streams", "Verifies streams are live through copying and moving") { int32_t device = -1; @@ -29,14 +29,14 @@ TEST_CASE( copyStream = s; - REQUIRE(copyStream.internals() == s.internals()); - REQUIRE(copyStream.device() == device); - REQUIRE(copyStream.stream() == cuda_stream); + CATCH_REQUIRE(copyStream.internals() == s.internals()); + CATCH_REQUIRE(copyStream.device() == device); + CATCH_REQUIRE(copyStream.stream() == cuda_stream); } - REQUIRE(copyStream.internals()); - REQUIRE(copyStream.device() == device); - REQUIRE(copyStream.stream() == cuda_stream); + CATCH_REQUIRE(copyStream.internals()); + CATCH_REQUIRE(copyStream.device() == device); + CATCH_REQUIRE(copyStream.stream() == cuda_stream); // Tests that moving works as expected and preserves the stream at::cuda::CUDAStream moveStream; @@ -47,41 +47,41 @@ TEST_CASE( moveStream = std::move(s); - REQUIRE(moveStream.device() == device); - REQUIRE(moveStream.stream() == cuda_stream); + CATCH_REQUIRE(moveStream.device() == device); + CATCH_REQUIRE(moveStream.stream() == cuda_stream); } - REQUIRE(moveStream.internals()); - REQUIRE(moveStream.device() == device); - REQUIRE(moveStream.stream() == cuda_stream); + CATCH_REQUIRE(moveStream.internals()); + CATCH_REQUIRE(moveStream.device() == device); + CATCH_REQUIRE(moveStream.stream() == cuda_stream); } -TEST_CASE("Getting and Setting Streams", "Verifies streams are set properly") { +CATCH_TEST_CASE("Getting and Setting Streams", "Verifies streams are set properly") { at::cuda::CUDAStream myStream = at::cuda::createCUDAStream(); // Sets and gets at::cuda::setCurrentCUDAStream(myStream); at::cuda::CUDAStream curStream = at::cuda::getCurrentCUDAStream(); - REQUIRE(myStream == curStream); + CATCH_REQUIRE(myStream == curStream); // Gets, sets, and gets default stream at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream(); at::cuda::setCurrentCUDAStream(defaultStream); curStream = at::cuda::getCurrentCUDAStream(); - REQUIRE(defaultStream != myStream); - REQUIRE(curStream == defaultStream); + CATCH_REQUIRE(defaultStream != myStream); + CATCH_REQUIRE(curStream == defaultStream); } void thread_fun(at::cuda::CUDAStream& cur_thread_stream) { auto new_stream = at::cuda::createCUDAStream(); at::cuda::setCurrentCUDAStream(new_stream); cur_thread_stream = at::cuda::getCurrentCUDAStream(); - REQUIRE(cur_thread_stream == new_stream); + CATCH_REQUIRE(cur_thread_stream == new_stream); } -TEST_CASE( +CATCH_TEST_CASE( "Multithread Getting and Setting", "Ensures streams are thread local") { at::cuda::CUDAStream s0, s1; @@ -94,25 +94,25 @@ TEST_CASE( at::cuda::CUDAStream cur_stream = at::cuda::getCurrentCUDAStream(); at::cuda::CUDAStream default_stream = at::cuda::getDefaultCUDAStream(); - REQUIRE(cur_stream == default_stream); - REQUIRE(cur_stream != s0); - REQUIRE(cur_stream != s1); - REQUIRE(s0 != s1); + CATCH_REQUIRE(cur_stream == default_stream); + CATCH_REQUIRE(cur_stream != s0); + CATCH_REQUIRE(cur_stream != s1); + CATCH_REQUIRE(s0 != s1); } -TEST_CASE("CUDAGuard") { +CATCH_TEST_CASE("CUDAGuard") { if (at::cuda::getNumGPUs() < 2) { return; } // -- begin setup - REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::current_device() == 0); std::vector streams0 = { at::cuda::getDefaultCUDAStream(), at::cuda::createCUDAStream()}; - REQUIRE(streams0[0].device() == 0); - REQUIRE(streams0[1].device() == 0); + CATCH_REQUIRE(streams0[0].device() == 0); + CATCH_REQUIRE(streams0[1].device() == 0); 
at::cuda::setCurrentCUDAStream(streams0[0]); std::vector streams1; @@ -121,47 +121,47 @@ TEST_CASE("CUDAGuard") { streams1.push_back(at::cuda::getDefaultCUDAStream()); streams1.push_back(at::cuda::createCUDAStream()); } - REQUIRE(streams1[0].device() == 1); - REQUIRE(streams1[1].device() == 1); + CATCH_REQUIRE(streams1[0].device() == 1); + CATCH_REQUIRE(streams1[1].device() == 1); at::cuda::setCurrentCUDAStream(streams1[0]); - REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::current_device() == 0); // -- end setup // Test that all original streams are recorded. { at::cuda::CUDAGuard guard; - REQUIRE(guard.original_streams().empty()); + CATCH_REQUIRE(guard.original_streams().empty()); guard.set_stream(streams0[0]); - REQUIRE( + CATCH_REQUIRE( guard.original_streams().size() == at::cuda::getNumGPUs()); - REQUIRE(guard.original_streams()[0] == streams0[0]); - REQUIRE(guard.original_streams()[1] == streams1[0]); + CATCH_REQUIRE(guard.original_streams()[0] == streams0[0]); + CATCH_REQUIRE(guard.original_streams()[1] == streams1[0]); } // Setting a stream changes the current device and the stream on that device { at::cuda::CUDAGuard guard(streams1[1]); - REQUIRE(guard.last_device() == 1); - REQUIRE(at::cuda::current_device() == 1); - REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[1]); + CATCH_REQUIRE(guard.last_device() == 1); + CATCH_REQUIRE(at::cuda::current_device() == 1); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[1]); } // Device and stream are now reset - REQUIRE(at::cuda::current_device() == 0); - REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + CATCH_REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); // Setting only the device changes only the current device and not the stream { at::cuda::CUDAGuard guard(/*device=*/1); - REQUIRE(guard.last_device() == 1); - REQUIRE(at::cuda::current_device() == 1); - REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + CATCH_REQUIRE(guard.last_device() == 1); + CATCH_REQUIRE(at::cuda::current_device() == 1); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); } - REQUIRE(at::cuda::current_device() == 0); - REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); + CATCH_REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); // Setting the stream first, and then the device, first changes the devices // back, and then resets the stream on the initial device. 
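
The CUDAGuard assertions in this test all check one RAII contract; a condensed sketch of that contract, using the same API the test exercises (illustrative only, not part of the patch):

#include "ATen/ATen.h"
#include "ATen/cuda/CUDAContext.h"
#include "ATen/cuda/CUDAGuard.h"

void cuda_guard_sketch() {
  const auto original_device = at::cuda::current_device();
  const auto s = at::cuda::createCUDAStream();  // fresh stream on the current device
  {
    at::cuda::CUDAGuard guard(s);  // makes s current, switching device if s lives on another GPU
    // ... launch work on s ...
  }
  // On destruction the guard restores the previous device and the previous
  // current stream on every device it touched.
  AT_ASSERT(at::cuda::current_device() == original_device);
}
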
@@ -171,12 +171,12 @@ TEST_CASE("CUDAGuard") { guard.set_device(1); } - REQUIRE(at::cuda::current_device() == 0); - REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); - REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); + CATCH_REQUIRE(at::cuda::current_device() == 0); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(0) == streams0[0]); + CATCH_REQUIRE(at::cuda::getCurrentCUDAStream(1) == streams1[0]); } -TEST_CASE("CUDAGuardIsMovable") { +CATCH_TEST_CASE("CUDAGuardIsMovable") { if (at::cuda::getNumGPUs() < 2) { return; } @@ -185,17 +185,17 @@ TEST_CASE("CUDAGuardIsMovable") { at::cuda::CUDAGuard first(stream); first.set_device(1); at::cuda::CUDAGuard second(std::move(first)); - REQUIRE(second.original_streams().size() == device_count); - REQUIRE(second.original_device() == 0); - REQUIRE(second.last_device() == 1); + CATCH_REQUIRE(second.original_streams().size() == device_count); + CATCH_REQUIRE(second.original_device() == 0); + CATCH_REQUIRE(second.last_device() == 1); at::cuda::CUDAGuard third; third = std::move(second); - REQUIRE(third.original_streams().size() == device_count); - REQUIRE(third.original_device() == 0); - REQUIRE(third.last_device() == 1); + CATCH_REQUIRE(third.original_streams().size() == device_count); + CATCH_REQUIRE(third.original_device() == 0); + CATCH_REQUIRE(third.last_device() == 1); } -TEST_CASE("Streampool Round Robin") { +CATCH_TEST_CASE("Streampool Round Robin") { std::vector streams{}; for (int i = 0; i < 200; ++i) { streams.emplace_back(at::cuda::detail::CUDAStream_createStream()); @@ -209,10 +209,10 @@ TEST_CASE("Streampool Round Robin") { if (!result_pair.second) hasDuplicates = true; } - REQUIRE(hasDuplicates); + CATCH_REQUIRE(hasDuplicates); } -TEST_CASE("Multi-GPU") { +CATCH_TEST_CASE("Multi-GPU") { if (at::cuda::getNumGPUs() < 2) return; at::cuda::CUDAStream s0 = at::cuda::createCUDAStream(true, 0); @@ -221,17 +221,17 @@ TEST_CASE("Multi-GPU") { at::cuda::setCurrentCUDAStream(s0); at::cuda::setCurrentCUDAStream(s1); - REQUIRE(s0 == at::cuda::getCurrentCUDAStream()); + CATCH_REQUIRE(s0 == at::cuda::getCurrentCUDAStream()); at::DeviceGuard device_guard{1}; - REQUIRE(s1 == at::cuda::getCurrentCUDAStream()); + CATCH_REQUIRE(s1 == at::cuda::getCurrentCUDAStream()); } -TEST_CASE("CUDAEvent Syncs") { +CATCH_TEST_CASE("CUDAEvent Syncs") { const auto stream = at::cuda::createCUDAStream(); at::cuda::CUDAEvent event; - REQUIRE(!event.happened()); + CATCH_REQUIRE(!event.happened()); event.recordOnce(stream); @@ -242,10 +242,10 @@ TEST_CASE("CUDAEvent Syncs") { wait_stream1.synchronize_with(event); cudaStreamSynchronize(wait_stream0); - REQUIRE(event.happened()); + CATCH_REQUIRE(event.happened()); } -TEST_CASE("Cross-Device Events") { +CATCH_TEST_CASE("Cross-Device Events") { if (at::cuda::getNumGPUs() < 2) return; const auto stream0 = at::cuda::createCUDAStream(); @@ -260,10 +260,10 @@ TEST_CASE("Cross-Device Events") { event0 = std::move(event1); - REQUIRE(event0.device() == 1); + CATCH_REQUIRE(event0.device() == 1); stream0.synchronize_with(event0); cudaStreamSynchronize(stream0); - REQUIRE(event0.happened()); + CATCH_REQUIRE(event0.happened()); } diff --git a/aten/src/ATen/test/test_parallel.cpp b/aten/src/ATen/test/test_parallel.cpp index 552328029ce03c..81701733b53693 100644 --- a/aten/src/ATen/test/test_parallel.cpp +++ b/aten/src/ATen/test/test_parallel.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "ATen/DLConvertor.h" @@ -11,7 +11,7 @@ using namespace at; 
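
The CUDAEvent tests above reduce to a producer/consumer pattern; a minimal sketch using the same calls as the test (illustrative only; CUDAEvent is assumed to be available through the ATen CUDA headers that stream_test.cpp already includes):

#include "ATen/cuda/CUDAContext.h"

void event_sync_sketch() {
  const auto producer = at::cuda::createCUDAStream();
  const auto consumer = at::cuda::createCUDAStream();

  at::cuda::CUDAEvent event;
  event.recordOnce(producer);        // mark a point in the producer stream
  consumer.synchronize_with(event);  // consumer will not proceed past that point early

  cudaStreamSynchronize(consumer);   // CUDAStream converts to the underlying cudaStream_t
  AT_ASSERT(event.happened());
}
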
-TEST_CASE( "parallel", "[cpu]" ) { +CATCH_TEST_CASE( "parallel", "[cpu]" ) { manual_seed(123, at::kCPU); set_num_threads(1); @@ -24,5 +24,5 @@ TEST_CASE( "parallel", "[cpu]" ) { as[0] = 1; as[1] = 0; as[2] = 0; - REQUIRE(a.sum(0).equal(as)); + CATCH_REQUIRE(a.sum(0).equal(as)); } diff --git a/aten/src/ATen/test/undefined_tensor_test.cpp b/aten/src/ATen/test/undefined_tensor_test.cpp index 1b605e7271c6c7..c01dff2d0038b1 100644 --- a/aten/src/ATen/test/undefined_tensor_test.cpp +++ b/aten/src/ATen/test/undefined_tensor_test.cpp @@ -1,14 +1,14 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" -#include "ATen/UndefinedTensor.h" +#include "ATen/core/UndefinedTensorImpl.h" #include #include "test_seed.h" using namespace at; -TEST_CASE( "undefined tensor test", "[]" ) { +CATCH_TEST_CASE( "undefined tensor test", "[]" ) { manual_seed(123, at::kCPU); // mainly test ops on undefined tensors don't segfault and give a reasonable errror message. @@ -17,36 +17,36 @@ TEST_CASE( "undefined tensor test", "[]" ) { std::stringstream ss; ss << und << std::endl; - REQUIRE(!und.defined()); - REQUIRE(std::string("UndefinedType") == und.toString()); - - REQUIRE_THROWS(und.strides()); - REQUIRE_THROWS(und.dim()); - REQUIRE_THROWS([]() {return Tensor();}() = Scalar(5)); - REQUIRE_THROWS(und.add(und)); - REQUIRE_THROWS(und.add(ft)); - REQUIRE_THROWS(ft.add(und)); - REQUIRE_THROWS(und.add(5)); - REQUIRE_THROWS(und.mm(und)); + CATCH_REQUIRE(!und.defined()); + CATCH_REQUIRE(std::string("UndefinedType") == und.toString()); + + _CATCH_REQUIRE_THROWS(und.strides()); + _CATCH_REQUIRE_THROWS(und.dim()); + _CATCH_REQUIRE_THROWS([]() {return Tensor();}() = Scalar(5)); + _CATCH_REQUIRE_THROWS(und.add(und)); + _CATCH_REQUIRE_THROWS(und.add(ft)); + _CATCH_REQUIRE_THROWS(ft.add(und)); + _CATCH_REQUIRE_THROWS(und.add(5)); + _CATCH_REQUIRE_THROWS(und.mm(und)); und.toType(und.type()); - REQUIRE_THROWS(und.toType(ft.type())); - REQUIRE_THROWS(ft.toType(und.type())); + _CATCH_REQUIRE_THROWS(und.toType(ft.type())); + _CATCH_REQUIRE_THROWS(ft.toType(und.type())); und.toType(ScalarType::Undefined); - REQUIRE_THROWS(und.toType(ScalarType::Float)); - REQUIRE_THROWS(ft.toType(ScalarType::Undefined)); + _CATCH_REQUIRE_THROWS(und.toType(ScalarType::Float)); + _CATCH_REQUIRE_THROWS(ft.toType(ScalarType::Undefined)); // copy_ - REQUIRE_THROWS(und.copy_(und)); - REQUIRE_THROWS(und.copy_(ft)); - REQUIRE_THROWS(ft.copy_(und)); + _CATCH_REQUIRE_THROWS(und.copy_(und)); + _CATCH_REQUIRE_THROWS(und.copy_(ft)); + _CATCH_REQUIRE_THROWS(ft.copy_(und)); und.toBackend(Backend::Undefined); - REQUIRE_THROWS(und.toBackend(Backend::CPU)); - REQUIRE_THROWS(ft.toBackend(Backend::Undefined)); + _CATCH_REQUIRE_THROWS(und.toBackend(Backend::CPU)); + _CATCH_REQUIRE_THROWS(ft.toBackend(Backend::Undefined)); Tensor to_move = ones({1}, CPU(kFloat)); Tensor m(std::move(to_move)); - REQUIRE(!to_move.defined()); - REQUIRE(to_move.unsafeGetTensorImpl() == UndefinedTensor::singleton()); + CATCH_REQUIRE(!to_move.defined()); + CATCH_REQUIRE(to_move.unsafeGetTensorImpl() == UndefinedTensorImpl::singleton()); } diff --git a/aten/src/ATen/test/weakref_test.cpp b/aten/src/ATen/test/weakref_test.cpp index 167520beb58382..42c9f61b19b5e1 100644 --- a/aten/src/ATen/test/weakref_test.cpp +++ b/aten/src/ATen/test/weakref_test.cpp @@ -1,5 +1,5 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" @@ -10,53 +10,53 @@ using at::Tensor; using at::WeakTensor; -TEST_CASE( "Weak pointer 
tests", "" ) { - SECTION("gets invalidated") { +CATCH_TEST_CASE( "Weak pointer tests", "" ) { + CATCH_SECTION("gets invalidated") { Tensor a = at::ones({2, 2}); WeakTensor b = a; a.reset(); - REQUIRE_FALSE(b.lock().defined()); + CATCH_REQUIRE_FALSE(b.lock().defined()); } - SECTION("can successfully lock") { + CATCH_SECTION("can successfully lock") { Tensor a = at::ones({2, 2}); WeakTensor b = a; auto c = b.lock(); - REQUIRE(c.defined()); + CATCH_REQUIRE(c.defined()); a.reset(); - REQUIRE(b.lock().defined()); + CATCH_REQUIRE(b.lock().defined()); c.reset(); - REQUIRE_FALSE(b.lock().defined()); + CATCH_REQUIRE_FALSE(b.lock().defined()); } - SECTION("updates refcounts correctly") { + CATCH_SECTION("updates refcounts correctly") { Tensor a = at::ones({2, 2}); - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 1); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 1); { WeakTensor b = a; - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 2); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 2); } - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 1); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 1); { WeakTensor b = a; - REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.use_count() == 1); auto locked = b.lock(); - REQUIRE(locked.defined()); - REQUIRE(a.use_count() == 2); + CATCH_REQUIRE(locked.defined()); + CATCH_REQUIRE(a.use_count() == 2); } - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 1); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 1); { WeakTensor b = a; - REQUIRE(a.use_count() == 1); - REQUIRE(a.weak_use_count() == 2); + CATCH_REQUIRE(a.use_count() == 1); + CATCH_REQUIRE(a.weak_use_count() == 2); a.reset(); - REQUIRE(b.use_count() == 0); - REQUIRE(b.weak_use_count() == 1); + CATCH_REQUIRE(b.use_count() == 0); + CATCH_REQUIRE(b.weak_use_count() == 1); } } } diff --git a/aten/src/ATen/test/wrapdim_test.cpp b/aten/src/ATen/test/wrapdim_test.cpp index 8e813bc7f7deeb..f76dac212a0921 100644 --- a/aten/src/ATen/test/wrapdim_test.cpp +++ b/aten/src/ATen/test/wrapdim_test.cpp @@ -1,43 +1,43 @@ #define CATCH_CONFIG_MAIN -#include "catch.hpp" +#include "catch_utils.hpp" #include "ATen/ATen.h" #include "test_seed.h" using namespace at; -TEST_CASE( "wrapdim test", "[]" ) { +CATCH_TEST_CASE( "wrapdim test", "[]" ) { manual_seed(123, at::kCPU); Type & T = CPU(kFloat); - SECTION( "simple case" ) { + CATCH_SECTION( "simple case" ) { auto a = randn({2, 3, 4, 5}, T); - REQUIRE(a.prod(-4).equal(a.prod(0))); - REQUIRE(a.prod(3).equal(a.prod(-1))); + CATCH_REQUIRE(a.prod(-4).equal(a.prod(0))); + CATCH_REQUIRE(a.prod(3).equal(a.prod(-1))); } - SECTION( "expression specification" ) { + CATCH_SECTION( "expression specification" ) { auto a = randn({2, 3, 4, 5}, T); - REQUIRE(a.unsqueeze(-5).equal(a.unsqueeze(0))); - REQUIRE(a.unsqueeze(4).equal(a.unsqueeze(-1))); + CATCH_REQUIRE(a.unsqueeze(-5).equal(a.unsqueeze(0))); + CATCH_REQUIRE(a.unsqueeze(4).equal(a.unsqueeze(-1))); // can unsqueeze scalar auto b = randn(1, T); b.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE(b.unsqueeze(0).equal(b.unsqueeze(-1))); + CATCH_REQUIRE(b.unsqueeze(0).equal(b.unsqueeze(-1))); } - SECTION( "empty tensor" ) { + CATCH_SECTION( "empty tensor" ) { auto a = randn(0, T); - REQUIRE(a.prod(0).equal(at::ones({}, T))); + CATCH_REQUIRE(a.prod(0).equal(at::ones({}, T))); } - SECTION( "scalar vs 1-dim, 1-size" ) { + CATCH_SECTION( "scalar vs 1-dim, 1-size" ) { auto a = randn(1, 
T); - REQUIRE(a.prod(0).equal(a.prod(-1))); + CATCH_REQUIRE(a.prod(0).equal(a.prod(-1))); a.unsafeGetTensorImpl()->maybe_zero_dim(true); - REQUIRE(a.dim() == 0); - REQUIRE(a.prod(0).equal(a.prod(-1))); + CATCH_REQUIRE(a.dim() == 0); + CATCH_REQUIRE(a.prod(0).equal(a.prod(-1))); } } diff --git a/aten/src/TH/CMakeLists.txt b/aten/src/TH/CMakeLists.txt index 9fe22beb0dc54e..10d43e1433c811 100644 --- a/aten/src/TH/CMakeLists.txt +++ b/aten/src/TH/CMakeLists.txt @@ -26,7 +26,6 @@ SET(hdr set(ATen_TH_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/THGeneral.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/THHalf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THAllocator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THSize.cpp ${CMAKE_CURRENT_SOURCE_DIR}/THStorageFunctions.cpp diff --git a/aten/src/TH/THDiskFile.cpp b/aten/src/TH/THDiskFile.cpp index a806df6ac62ebf..ddf993df6605d7 100644 --- a/aten/src/TH/THDiskFile.cpp +++ b/aten/src/TH/THDiskFile.cpp @@ -359,9 +359,9 @@ READ_WRITE_METHODS(float, Float, int ret = fscanf(dfself->handle, "%g", &data[i]); if(ret <= 0) break; else nread++, int ret = fprintf(dfself->handle, "%.9g", data[i]); if(ret <= 0) break; else nwrite++) -READ_WRITE_METHODS(THHalf, Half, - float buf; int ret = fscanf(dfself->handle, "%g", &buf); if(ret <= 0) break; else { data[i]= TH_float2half(buf); nread++; }, - int ret = fprintf(dfself->handle, "%.9g", TH_half2float(data[i])); if(ret <= 0) break; else nwrite++) +READ_WRITE_METHODS(at::Half, Half, + float buf; int ret = fscanf(dfself->handle, "%g", &buf); if(ret <= 0) break; else { data[i]= static_cast(buf); nread++; }, + int ret = fprintf(dfself->handle, "%.9g", static_cast(data[i])); if(ret <= 0) break; else nwrite++) READ_WRITE_METHODS(double, Double, int ret = fscanf(dfself->handle, "%lg", &data[i]); if(ret <= 0) break; else nread++, diff --git a/aten/src/TH/THGenerateHalfType.h b/aten/src/TH/THGenerateHalfType.h index 09d4c878d0f11e..8e1b5eaed4d946 100644 --- a/aten/src/TH/THGenerateHalfType.h +++ b/aten/src/TH/THGenerateHalfType.h @@ -5,8 +5,8 @@ #include "THHalf.h" #define scalar_t THHalf #define accreal float -#define TH_CONVERT_REAL_TO_ACCREAL(_val) TH_half2float(_val) -#define TH_CONVERT_ACCREAL_TO_REAL(_val) TH_float2half(_val) +#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) +#define TH_CONVERT_ACCREAL_TO_REAL(_val) (scalar_t)(_val) #define Real Half #define THInf TH_HALF_BITS_TO_LITERAL(TH_HALF_INF) #define TH_REAL_IS_HALF diff --git a/aten/src/TH/THHalf.cpp b/aten/src/TH/THHalf.cpp deleted file mode 100644 index c95272c5f13e34..00000000000000 --- a/aten/src/TH/THHalf.cpp +++ /dev/null @@ -1,30 +0,0 @@ -#include "THHalf.h" -#include - -/* Copyright 1993-2014 NVIDIA Corporation. All rights reserved. 
*/ - -THHalf TH_float2half(float f) -{ - THHalf h; - TH_float2halfbits(&f, &h.x); - return h; -} - -TH_API float TH_half2float(THHalf h) -{ - float f; - TH_halfbits2float(&h.x, &f); - return f; -} - - -void TH_halfbits2float(unsigned short* src, float* res) -{ - *res = at::detail::halfbits2float(*src); -} - - -void TH_float2halfbits(float* src, unsigned short* dest) -{ - *dest = at::detail::float2halfbits(*src); -} diff --git a/aten/src/TH/THHalf.h b/aten/src/TH/THHalf.h index fb68639ec44752..f7c884f2cc67bd 100644 --- a/aten/src/TH/THHalf.h +++ b/aten/src/TH/THHalf.h @@ -1,10 +1,8 @@ #ifndef TH_HALF_H #define TH_HALF_H -#include - #ifdef __cplusplus -#include +#include #endif #ifdef __cplusplus @@ -14,10 +12,4 @@ typedef struct at_Half at_Half; #define THHalf at_Half #endif -TH_API void TH_float2halfbits(float*, unsigned short*); -TH_API void TH_halfbits2float(unsigned short*, float*); - -TH_API THHalf TH_float2half(float); -TH_API float TH_half2float(THHalf); - #endif diff --git a/aten/src/TH/THMemoryFile.cpp b/aten/src/TH/THMemoryFile.cpp index 3f2187b68f74ea..b74d7926ebff21 100644 --- a/aten/src/TH/THMemoryFile.cpp +++ b/aten/src/TH/THMemoryFile.cpp @@ -343,11 +343,11 @@ READ_WRITE_METHODS(float, Float, nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", data[i]), 1) -READ_WRITE_METHODS(THHalf, Half, +READ_WRITE_METHODS(at::Half, Half, int nByteRead_; float buf; \ int ret = sscanf((char*) THCharStorage_data(mfself->storage)+mfself->position, "%g%n", &buf, &nByteRead_); \ - data[i] = TH_float2half(buf); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", TH_half2float(data[i])), + data[i] = static_cast(buf); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, + nByteWritten = snprintf((char*) THCharStorage_data(mfself->storage)+mfself->position, mfself->storage->numel()-mfself->position, "%.9g", static_cast(data[i])), 1) READ_WRITE_METHODS(double, Double, diff --git a/aten/src/TH/THStorageFunctions.cpp b/aten/src/TH/THStorageFunctions.cpp index a5319e67dabe61..c4d1f778250ea2 100644 --- a/aten/src/TH/THStorageFunctions.cpp +++ b/aten/src/TH/THStorageFunctions.cpp @@ -15,9 +15,9 @@ #include "generic/THStorageCopy.cpp" #include "THGenerateHalfType.h" -THStorage* THStorage_new(at::ScalarType scalar_type) { +THStorage* THStorage_new(caffe2::TypeMeta data_type) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), + data_type, 0, getTHDefaultAllocator(), true).release(); diff --git a/aten/src/TH/THStorageFunctions.hpp b/aten/src/TH/THStorageFunctions.hpp index 93a89f1753f000..95e5bacc2cef55 100644 --- a/aten/src/TH/THStorageFunctions.hpp +++ b/aten/src/TH/THStorageFunctions.hpp @@ -30,7 +30,7 @@ // If it is not, you must report that the storage is dead. 
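
The Half-related hunks above all rely on one idiom: TH_float2half/TH_half2float are deleted, and at::Half converts to and from float through ordinary casts, exactly as the static_cast calls in THDiskFile.cpp and THMemoryFile.cpp show. A minimal sketch (the header path is an assumption based on the include this patch switches THHalf.h to):

#include <ATen/core/Half.h>  // assumed location of at::Half in this revision

float half_roundtrip_sketch(float f) {
  at::Half h = static_cast<at::Half>(f);  // replaces TH_float2half(f)
  return static_cast<float>(h);           // replaces TH_half2float(h)
}
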
// -TH_CPP_API THStorage* THStorage_new(at::ScalarType scalar_type); +TH_CPP_API THStorage* THStorage_new(caffe2::TypeMeta data_type); TH_API ptrdiff_t THStorage_size(const THStorage *self); TH_API void THStorage_retain(THStorage *storage); diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index 0c731779b95685..887b0dcf2dff97 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -8,7 +8,7 @@ #include -// NB: This is NOT valid on UndefinedTensor +// NB: This is NOT valid on UndefinedTensorImpl void THTensor_free(THTensor *self) { if (!self) return; @@ -39,14 +39,14 @@ void THTensor_setStorageNd(THTensor *self, THStorage *storage, ptrdiff_t storage if (!THTensor_getStoragePtr(self)) { THError("Tensor: invalid null storage"); } - auto scalar_type = at::dataTypeToScalarType(THTensor_getStoragePtr(self)->dtype()); + auto data_type = THTensor_getStoragePtr(self)->dtype(); if(storage) { c10::raw::intrusive_ptr::incref(storage); THTensor_stealAndSetStoragePtr(self, storage); } else { - THTensor_stealAndSetStoragePtr(self, THStorage_new(scalar_type)); + THTensor_stealAndSetStoragePtr(self, THStorage_new(data_type)); } } @@ -123,7 +123,7 @@ void THTensor_resizeNd(THTensor *self, int nDimension, const int64_t *size, cons if(totalSize+self->storage_offset() > 0) { if(!THTensor_getStoragePtr(self)) { - THTensor_stealAndSetStoragePtr(self, THStorage_new(self->scalar_type())); + THTensor_stealAndSetStoragePtr(self, THStorage_new(self->dtype())); } if(totalSize+self->storage_offset() > THTensor_getStoragePtr(self)->numel()) { THStorage_resize(THTensor_getStoragePtr(self), totalSize+self->storage_offset()); @@ -202,5 +202,5 @@ void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage) { // Caffe2 might have tensors whose storages are null, but we // don't allow it in PyTorch. 
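
The storage hunks above change the key used to create a THStorage from an at::ScalarType to a caffe2::TypeMeta. A hedged sketch of a call site before and after (function and type names are taken from the diff; the wrapper function itself is hypothetical):

#include "THStorageFunctions.hpp"  // declares THStorage_new(caffe2::TypeMeta)

THStorage* new_float_storage_sketch() {
  // before this patch: THStorage_new(at::ScalarType::Float)
  // element size now comes from the TypeMeta itself, e.g.
  // caffe2::TypeMeta::Make<float>().itemsize() == sizeof(float)
  return THStorage_new(caffe2::TypeMeta::Make<float>());
}
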
AT_ASSERT(storage); - tensor->storage_ = at::Storage(storage); + tensor->storage_ = at::Storage(c10::intrusive_ptr::reclaim(storage)); } diff --git a/aten/src/TH/THTensorCopy.cpp b/aten/src/TH/THTensorCopy.cpp index d8df519e26bdbc..482a7b986f5302 100644 --- a/aten/src/TH/THTensorCopy.cpp +++ b/aten/src/TH/THTensorCopy.cpp @@ -1,6 +1,8 @@ #include "THTensor.hpp" #include "THVector.h" +#include + #include "generic/THTensorCopy.cpp" #include "THGenerateAllTypes.h" diff --git a/aten/src/TH/generic/THStorage.cpp b/aten/src/TH/generic/THStorage.cpp index 7ed962567a31ff..6dfd90cfbe1bd2 100644 --- a/aten/src/TH/generic/THStorage.cpp +++ b/aten/src/TH/generic/THStorage.cpp @@ -21,13 +21,13 @@ size_t THStorage_(elementSize)() THStorage* THStorage_(new)(void) { - return THStorage_new(at::CTypeToScalarType::to()); + return THStorage_new(caffe2::TypeMeta::Make()); } THStorage* THStorage_(newWithSize)(ptrdiff_t size) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, getTHDefaultAllocator(), true).release(); @@ -38,7 +38,7 @@ THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, at::Allocator *allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, allocator, true).release(); @@ -48,18 +48,18 @@ THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int flags) { - auto scalar_type = at::CTypeToScalarType::to(); + auto type_meta = caffe2::TypeMeta::Make(); size_t actual_size = -1; THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), + type_meta, size, THMapAllocator::makeDataPtr( - filename, flags, size * at::elementSize(scalar_type), &actual_size), + filename, flags, size * type_meta.itemsize(), &actual_size), /* allocator */ nullptr, false).release(); if (size <= 0) { - storage->set_numel(actual_size / at::elementSize(scalar_type)); + storage->set_numel(actual_size / type_meta.itemsize()); } return storage; @@ -116,7 +116,7 @@ void THStorage_(free)(THStorage *storage) THStorage* THStorage_(newWithDataAndAllocator)(at::DataPtr&& data, ptrdiff_t size, at::Allocator* allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, std::move(data), allocator, diff --git a/aten/src/TH/generic/THStorageCopy.cpp b/aten/src/TH/generic/THStorageCopy.cpp index 1de588bbd2d75b..ea8a0d5808cb16 100644 --- a/aten/src/TH/generic/THStorageCopy.cpp +++ b/aten/src/TH/generic/THStorageCopy.cpp @@ -29,40 +29,6 @@ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage data[i] = static_cast(src_data[i]); \ } -#define IMPLEMENT_THStorage_COPY_FROM_HALF(TYPENAMESRC) \ -void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ -{ \ - THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ - ptrdiff_t i; \ - auto data = THStorage_(data)(storage); \ - auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->numel(); i++) \ - data[i] = (scalar_t)TH_half2float(src_data[i]); \ -} - -#define IMPLEMENT_THStorage_COPY_TO_HALF(TYPENAMESRC) \ -void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ -{ \ - THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ - ptrdiff_t i; \ - auto data = THStorage_(data)(storage); \ - auto src_data = 
TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->numel(); i++) \ - data[i] = TH_float2half((float)(src_data[i])); \ -} - -#define IMPLEMENT_THStorage_COPY_TO_FROM_HALF(TYPENAMESRC) \ -void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ -{ \ - THArgCheck(storage->numel() == src->numel(), 2, "size mismatch"); \ - ptrdiff_t i; \ - auto data = THStorage_(data)(storage); \ - auto src_data = TH##TYPENAMESRC##Storage_data(src); \ - for(i = 0; i < storage->numel(); i++) \ - data[i] = static_cast(src_data[i]); \ -} - -#ifndef TH_REAL_IS_HALF IMPLEMENT_THStorage_COPY(Byte) IMPLEMENT_THStorage_COPY(Char) IMPLEMENT_THStorage_COPY(Short) @@ -70,18 +36,6 @@ IMPLEMENT_THStorage_COPY(Int) IMPLEMENT_THStorage_COPY(Long) IMPLEMENT_THStorage_COPY(Float) IMPLEMENT_THStorage_COPY(Double) -IMPLEMENT_THStorage_COPY_FROM_HALF(Half) -#else -/* only allow pass-through for Half */ -IMPLEMENT_THStorage_COPY_TO_FROM_HALF(Half) -IMPLEMENT_THStorage_COPY_TO_HALF(Byte) -IMPLEMENT_THStorage_COPY_TO_HALF(Char) -IMPLEMENT_THStorage_COPY_TO_HALF(Short) -IMPLEMENT_THStorage_COPY_TO_HALF(Int) -IMPLEMENT_THStorage_COPY_TO_HALF(Long) -IMPLEMENT_THStorage_COPY_TO_HALF(Float) -IMPLEMENT_THStorage_COPY_TO_HALF(Double) -#endif - +IMPLEMENT_THStorage_COPY(Half) #endif diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp index 3f373ee2119c26..9a7bd0be736f25 100644 --- a/aten/src/TH/generic/THTensor.cpp +++ b/aten/src/TH/generic/THTensor.cpp @@ -54,13 +54,21 @@ scalar_t *THTensor_(data)(const THTensor *self) { /* Empty init */ THTensor *THTensor_(new)(void) { - return c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); + return c10::make_intrusive( + c10::intrusive_ptr::reclaim(THStorage_(new)()), + at::CPUTensorId(), + false + ).release(); } /* Pointer-copy init */ THTensor *THTensor_(newWithTensor)(THTensor *tensor) { - THTensor *self = c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); + THTensor *self = c10::make_intrusive( + c10::intrusive_ptr::reclaim(THStorage_(new)()), + at::CPUTensorId(), + false + ).release(); THTensor_(setStorageNd)(self, THTensor_getStoragePtr(tensor), tensor->storage_offset(), @@ -75,7 +83,11 @@ THTensor *THTensor_(newWithStorage)(THStorage *storage, ptrdiff_t storageOffset, if (strides.data()) { AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); } - THTensor *self = c10::make_intrusive(THStorage_(new)(), at::CPUTensorId(), false).release(); + THTensor *self = c10::make_intrusive( + c10::intrusive_ptr::reclaim(THStorage_(new)()), + at::CPUTensorId(), + false + ).release(); THTensor_(setStorageNd)(self, storage, storageOffset, sizes.size(), const_cast(sizes.data()), const_cast(strides.data())); @@ -547,7 +559,7 @@ ptrdiff_t THTensor_(nElement)(const THTensor *self) } } -// NB: It is INVALID to call this on an UndefinedTensor +// NB: It is INVALID to call this on an UndefinedTensorImpl void THTensor_(retain)(THTensor *self) { c10::raw::intrusive_ptr::incref(self); diff --git a/aten/src/TH/generic/THTensor.h b/aten/src/TH/generic/THTensor.h index d5316919d4e31e..27807ea57a7dd0 100644 --- a/aten/src/TH/generic/THTensor.h +++ b/aten/src/TH/generic/THTensor.h @@ -5,7 +5,7 @@ /* a la lua? dim, storageoffset, ... et les methodes ? 
*/ #ifdef __cplusplus -#include +#include #endif #ifdef __cplusplus diff --git a/aten/src/TH/generic/THTensorCopy.cpp b/aten/src/TH/generic/THTensorCopy.cpp index a9e0564fb574c8..91a60275a6c03e 100644 --- a/aten/src/TH/generic/THTensorCopy.cpp +++ b/aten/src/TH/generic/THTensorCopy.cpp @@ -26,13 +26,11 @@ int THTensor_(copyTransposeValid)(THTensor *tensor, THTensor *src) { // special case copy where tensor is contiguous and src is a transposed matrix // This can be generalized to most copies, but it's tricker void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { - #define MIN(x, y) (((x) < (y)) ? (x) : (y)) - #define MAX(x, y) (((x) > (y)) ? (x) : (y)) #ifdef TH_REAL_IS_BYTE - const int BLOCK_SZ = 120; + const int64_t BLOCK_SZ = 120; #else - const int BLOCK_SZ = 60; + const int64_t BLOCK_SZ = 60; #endif THTensor *buf = THTensor_(newWithSize2d)(BLOCK_SZ, BLOCK_SZ); @@ -48,8 +46,8 @@ void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { scalar_t *spo = sp + R + C * NR; scalar_t *rpo = rp + C + R * NC; - int nr = MIN(NR - R, BLOCK_SZ); - int nc = MIN(NC - C, BLOCK_SZ); + int nr = std::min(NR - R, BLOCK_SZ); + int nc = std::min(NC - C, BLOCK_SZ); // 1. copy columns from src to buf for (int c = 0; c < nc; c++) { @@ -57,10 +55,10 @@ void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { } // 2. transpose buf in place - int rc_max = MAX(nr, nc); - int rc_min = MIN(nr, nc); + int rc_max = std::max(nr, nc); + int rc_min = std::min(nr, nc); for (int r = 0; r < rc_max; r++) { - int end = MIN(r, rc_min); + int end = std::min(r, rc_min); for (int c = 0; c < end; c++) { scalar_t tmp = bp[r + BLOCK_SZ * c]; bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c]; @@ -75,8 +73,6 @@ void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { } } c10::raw::intrusive_ptr::decref(buf); - #undef MIN - #undef MAX } void THTensor_(copy)(THTensor *tensor, THTensor *src) @@ -203,28 +199,6 @@ void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src static_cast>(*src_data));) \ } -#define IMPLEMENT_THTensor_COPY_TO_HALF(TYPENAMESRC, TYPE_SRC) \ -void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ -{ \ - TH_TENSOR_APPLY2(scalar_t, tensor, TYPE_SRC, src, *tensor_data = TH_float2half((float)*src_data);) \ -} - -#define IMPLEMENT_THTensor_COPY_FROM_HALF(TYPENAMESRC, TYPE_SRC) \ -void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ -{ \ - TH_TENSOR_APPLY2(scalar_t, tensor, TYPE_SRC, src, \ - *tensor_data = static_cast( \ - static_cast>( \ - TH_half2float(*src_data)));) \ -} - -#define IMPLEMENT_THTensor_COPY_TO_FROM_HALF(TYPENAMESRC, TYPE_SRC) \ -void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ -{ \ - TH_TENSOR_APPLY2(scalar_t, tensor, TYPE_SRC, src, *tensor_data = *src_data;) \ -} - -#ifndef TH_REAL_IS_HALF IMPLEMENT_THTensor_COPY(Byte, uint8_t) IMPLEMENT_THTensor_COPY(Char, int8_t) IMPLEMENT_THTensor_COPY(Short, int16_t) @@ -232,18 +206,6 @@ IMPLEMENT_THTensor_COPY(Int, int32_t) IMPLEMENT_THTensor_COPY(Long, int64_t) IMPLEMENT_THTensor_COPY(Float, float) IMPLEMENT_THTensor_COPY(Double, double) -IMPLEMENT_THTensor_COPY_FROM_HALF(Half, THHalf) -#else -/* only allow pass-through for Half */ -IMPLEMENT_THTensor_COPY_TO_FROM_HALF(Half, THHalf) -IMPLEMENT_THTensor_COPY_TO_HALF(Byte, uint8_t) -IMPLEMENT_THTensor_COPY_TO_HALF(Char, int8_t) -IMPLEMENT_THTensor_COPY_TO_HALF(Short, int16_t) -IMPLEMENT_THTensor_COPY_TO_HALF(Int, int32_t) -IMPLEMENT_THTensor_COPY_TO_HALF(Long, int64_t) 
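For context on the copyTranspose hunk above: the routine copies a transposed source into a contiguous destination in BLOCK_SZ-by-BLOCK_SZ tiles so that both the strided reads and the contiguous writes stay cache-friendly; replacing the MIN/MAX macros with std::min/std::max and widening BLOCK_SZ to int64_t does not change that algorithm. Below is a minimal standalone sketch of the same tiling idea; it writes each tile directly and omits the intermediate buffer (and the in-place buffer transpose) that the real code uses.

#include <algorithm>
#include <cstdint>
#include <vector>

// Simplified sketch of the tiling in THTensor_(copyTranspose): dst is a
// contiguous NR x NC matrix, src holds the transposed data (NC x NR, also
// contiguous). Working in BLOCK_SZ x BLOCK_SZ tiles keeps both access
// patterns cache-friendly.
void copy_transpose(float* dst, const float* src, int64_t NR, int64_t NC) {
  const int64_t BLOCK_SZ = 60;  // same block size the non-byte path uses
  for (int64_t R = 0; R < NR; R += BLOCK_SZ) {
    for (int64_t C = 0; C < NC; C += BLOCK_SZ) {
      int64_t nr = std::min(NR - R, BLOCK_SZ);
      int64_t nc = std::min(NC - C, BLOCK_SZ);
      for (int64_t r = 0; r < nr; ++r)
        for (int64_t c = 0; c < nc; ++c)
          // dst is row-major NR x NC; src is its transpose, row-major NC x NR.
          dst[(R + r) * NC + (C + c)] = src[(C + c) * NR + (R + r)];
    }
  }
}

int main() {
  const int64_t NR = 130, NC = 70;
  std::vector<float> src(NR * NC), dst(NR * NC);
  for (int64_t i = 0; i < NC; ++i)
    for (int64_t j = 0; j < NR; ++j)
      src[i * NR + j] = static_cast<float>(i * NR + j);  // fill the NC x NR source
  copy_transpose(dst.data(), src.data(), NR, NC);
  return dst[3 * NC + 5] == src[5 * NR + 3] ? 0 : 1;  // spot-check one element
}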
-IMPLEMENT_THTensor_COPY_TO_HALF(Float, float) -IMPLEMENT_THTensor_COPY_TO_HALF(Double, double) - -#endif /* REAL_IS_HALF */ +IMPLEMENT_THTensor_COPY(Half, at::Half) #endif diff --git a/aten/src/THC/CMakeLists.txt b/aten/src/THC/CMakeLists.txt index 44f5d188d5b42b..4b8ab9f4ab101a 100644 --- a/aten/src/THC/CMakeLists.txt +++ b/aten/src/THC/CMakeLists.txt @@ -92,7 +92,6 @@ INSTALL(FILES THCGenerateFloatType.h THCGenerateFloatTypes.h THCGenerateDoubleType.h - THCHalf.h THCIntegerDivider.cuh THCNumerics.cuh THCTensorSort.cuh diff --git a/aten/src/THC/THCAtomics.cuh b/aten/src/THC/THCAtomics.cuh index 8fec96dde6f647..756fa0f905ac13 100644 --- a/aten/src/THC/THCAtomics.cuh +++ b/aten/src/THC/THCAtomics.cuh @@ -2,7 +2,7 @@ #define THC_ATOMICS_INC #include "THC.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCNumerics.cuh" #include "ATen/ATen.h" @@ -95,7 +95,7 @@ static inline __device__ void atomicAdd(int64_t *address, int64_t val) { AtomicAddIntegerImpl()(address, val); } -static inline __device__ void atomicAdd(half *address, half val) { +static inline __device__ void atomicAdd(at::Half *address, at::Half val) { unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); unsigned int old = *address_as_ui; @@ -103,23 +103,13 @@ static inline __device__ void atomicAdd(half *address, half val) { do { assumed = old; -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - half hsum; + at::Half hsum; hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - hsum = THCNumerics::add(hsum, val); -#else - __half_raw hsum; - hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - half tmpres = THCNumerics::add(hsum, val); - hsum = __half_raw(tmpres); -#endif + hsum = THCNumerics::add(hsum, val); old = (size_t)address & 2 ? 
(old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; old = atomicCAS(address_as_ui, assumed, old); } while (assumed != old); } -static inline __device__ void atomicAdd(at::Half *address, at::Half val) { - atomicAdd(reinterpret_cast(address), val); -} #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000) // from CUDA C Programmic Guide diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu index 20b13d82b9a152..6375ced8c691cc 100644 --- a/aten/src/THC/THCBlas.cu +++ b/aten/src/THC/THCBlas.cu @@ -1,6 +1,6 @@ #include "THCBlas.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include @@ -50,7 +50,7 @@ double THCudaBlas_Ddot(THCState *state, int64_t n, double *x, int64_t incx, doub return 0; } -half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, int64_t incy) +at::Half THCudaBlas_Hdot(THCState *state, int64_t n, at::Half *x, int64_t incx, at::Half *y, int64_t incy) { #if CUDA_VERSION >= 8000 if (n == 1) { @@ -59,7 +59,7 @@ half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, } if ((n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX)) { - half result; + at::Half result; cublasHandle_t handle = THCState_getCurrentBlasHandle(state); cublasSetStream(handle, THCState_getCurrentStream(state)); THCublasCheck(cublasDotEx(handle, n, @@ -72,10 +72,10 @@ half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, THError("Cublas_Hdot only supports n, incx and incy " "up to signed integer limits: %d", INT_MAX); - return THC_float2half(0); + return 0.0; #else THError("Cublas_Hdot requires CUDA 8.0+"); - return THC_float2half(0); + return 0.0; #endif } @@ -267,7 +267,7 @@ void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int6 # define CUDA_R_16F CUBLAS_DATA_HALF #endif -void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, half alpha, half *a, int64_t lda, half *b, int64_t ldb, half beta, half *c, int64_t ldc) +void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::Half alpha, at::Half *a, int64_t lda, at::Half *b, int64_t ldb, at::Half beta, at::Half *c, int64_t ldc) { adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); @@ -293,8 +293,8 @@ void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int6 #else // Simulated Hgemm - float fAlpha = THC_half2float(alpha); - float fBeta = THC_half2float(beta); + float fAlpha = alpha; + float fBeta = beta; #if CUDA_VERSION < 9000 THCublasCheck(cublasSgemmEx(handle, opa, opb, @@ -355,8 +355,8 @@ void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int6 #if CUDA_VERSION >= 9010 void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, - half alpha, const half *a, int64_t lda, int64_t strideA, const half *b, int64_t ldb, int64_t strideB, - half beta, half *c, int64_t ldc, int64_t strideC, int64_t batchCount) + at::Half alpha, const at::Half *a, int64_t lda, int64_t strideA, const at::Half *b, int64_t ldb, int64_t strideB, + at::Half beta, at::Half *c, int64_t ldc, int64_t strideC, int64_t batchCount) { if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) @@ -371,8 +371,8 @@ void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, i 
cublasHandle_t handle = THCState_getCurrentBlasHandle(state); cublasSetStream(handle, THCState_getCurrentStream(state)); - float fAlpha = THC_half2float(alpha); - float fBeta = THC_half2float(beta); + float fAlpha = alpha; + float fBeta = beta; THCublasCheck(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); THCublasCheck(cublasGemmStridedBatchedEx(handle, opa, opb, (int)m, (int)n, (int)k, diff --git a/aten/src/THC/THCBlas.h b/aten/src/THC/THCBlas.h index 36a0b35294dff2..a73b47f162c3f3 100644 --- a/aten/src/THC/THCBlas.h +++ b/aten/src/THC/THCBlas.h @@ -2,12 +2,12 @@ #define THC_BLAS_INC #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" /* Level 1 */ THC_API float THCudaBlas_Sdot(THCState *state, int64_t n, float *x, int64_t incx, float *y, int64_t incy); THC_API double THCudaBlas_Ddot(THCState *state, int64_t n, double *x, int64_t incx, double *y, int64_t incy); -THC_API half THCudaBlas_Hdot(THCState *state, int64_t n, half *x, int64_t incx, half *y, int64_t incy); +THC_API THHalf THCudaBlas_Hdot(THCState *state, int64_t n, THHalf *x, int64_t incx, THHalf *y, int64_t incy); /* Level 2 */ THC_API void THCudaBlas_Sgemv(THCState *state, char trans, int64_t m, int64_t n, float alpha, float *a, int64_t lda, float *x, int64_t incx, float beta, float *y, int64_t incy); @@ -19,7 +19,7 @@ THC_API void THCudaBlas_Dger(THCState *state, int64_t m, int64_t n, double alpha THC_API void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, float *a, int64_t lda, float *b, int64_t ldb, float beta, float *c, int64_t ldc); THC_API void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, double *a, int64_t lda, double *b, int64_t ldb, double beta, double *c, int64_t ldc); -THC_API void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, half alpha, half *a, int64_t lda, half *b, int64_t ldb, half beta, half *c, int64_t ldc); +THC_API void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, THHalf alpha, THHalf *a, int64_t lda, THHalf *b, int64_t ldb, THHalf beta, THHalf *c, int64_t ldc); THC_API void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const float *a[], int64_t lda, const float *b[], int64_t ldb, @@ -38,8 +38,8 @@ THC_API void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char t #if CUDA_VERSION >= 9010 void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, - half alpha, const half *a, int64_t lda, int64_t strideA, const half *b, int64_t ldb, int64_t strideB, - half beta, half *c, int64_t ldc, int64_t strideC, int64_t batchCount); + THHalf alpha, const THHalf *a, int64_t lda, int64_t strideA, const THHalf *b, int64_t ldb, int64_t strideB, + THHalf beta, THHalf *c, int64_t ldc, int64_t strideC, int64_t batchCount); #endif /* Inverse */ diff --git a/aten/src/THC/THCGeneral.cpp b/aten/src/THC/THCGeneral.cpp index 6bc7da7cee6e2d..05da61bd56e754 100644 --- a/aten/src/THC/THCGeneral.cpp +++ b/aten/src/THC/THCGeneral.cpp @@ -125,7 +125,7 @@ void THCudaShutdown(THCState* state) for (int dev = 0; dev < deviceCount; ++dev) { THCudaCheck(cudaSetDevice(dev)); THCCudaResourcesPerDevice* res = &(state->resourcesPerDevice[dev]); - + // Frees BLAS handle if (res->blasHandle) { THCublasCheck(cublasDestroy(res->blasHandle)); @@ -256,7 +256,7 @@ cublasHandle_t 
THCState_getCurrentBlasHandle(THCState *state) THError("THCState and sparseHandles must be set as there is no default sparseHandle"); return NULL; } - + int device; THCudaCheck(cudaGetDevice(&device)); @@ -280,7 +280,7 @@ cusparseHandle_t THCState_getCurrentSparseHandle(THCState *state) int device; THCudaCheck(cudaGetDevice(&device)); - + // Creates the sparse handle if not created yet THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device); if (!res->sparseHandle) { @@ -474,30 +474,3 @@ cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalB #include "THCStorage.cpp" #include "THCAllocator.cpp" - -/* from THCHalf.h */ - -half THC_float2half(float f) -{ -#if CUDA_VERSION < 9000 - half h; - TH_float2halfbits(&f, &h.x); - return h; -#else - __half_raw h_raw; - TH_float2halfbits(&f, &h_raw.x); - return half(h_raw); -#endif -} - -float THC_half2float(half h) -{ - float f; -#if CUDA_VERSION < 9000 - TH_halfbits2float(&h.x, &f); -#else - __half_raw h_raw(h); - TH_halfbits2float(&h_raw.x, &f); -#endif - return f; -} diff --git a/aten/src/THC/THCGenerateHalfType.h b/aten/src/THC/THCGenerateHalfType.h index 54358a9be1a59d..596ea47904820f 100644 --- a/aten/src/THC/THCGenerateHalfType.h +++ b/aten/src/THC/THCGenerateHalfType.h @@ -2,9 +2,9 @@ #error "You must define THC_GENERIC_FILE before including THGenerateHalfType.h" #endif -#include "THCHalf.h" +#include "TH/THHalf.h" -#define scalar_t half +#define scalar_t THHalf #define accreal float #define Real Half diff --git a/aten/src/THC/THCHalf.h b/aten/src/THC/THCHalf.h deleted file mode 100644 index aeae06fc4739ba..00000000000000 --- a/aten/src/THC/THCHalf.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef THC_HALF_CONVERSION_INC -#define THC_HALF_CONVERSION_INC - -#include "THCGeneral.h" - -#include -#include - -#if CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__) -#ifndef __cplusplus -typedef __half_raw half; -#endif -#endif - -THC_API half THC_float2half(float a); -THC_API float THC_half2float(half a); - -#endif diff --git a/aten/src/THC/THCNumerics.cuh b/aten/src/THC/THCNumerics.cuh index 8368141d474ec4..ed6f44c2114726 100644 --- a/aten/src/THC/THCNumerics.cuh +++ b/aten/src/THC/THCNumerics.cuh @@ -4,7 +4,7 @@ #include #include #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "ATen/ATen.h" #include "ATen/cuda/NumericLimits.cuh" @@ -165,214 +165,81 @@ struct THCNumerics { // DEPRECATED: use math functions from std and NumericLimits.cuh template <> -struct THCNumerics { - static inline __host__ __device__ half min() { return at::numeric_limits::lowest(); } - static inline __host__ __device__ half max() { return at::numeric_limits::max(); } - static inline __host__ __device__ half lower_bound() { return at::numeric_limits::lower_bound(); } - static inline __host__ __device__ half upper_bound() { return at::numeric_limits::upper_bound(); } - - static inline __host__ __device__ bool lt(half a, half b) { - return static_cast(a) < static_cast(b); - } - - static inline __host__ __device__ bool le(half a, half b) { - return static_cast(a) <= static_cast(b); - } - - static inline __host__ __device__ bool gt(half a, half b) { - return static_cast(a) > static_cast(b); - } - - static inline __host__ __device__ bool ge(half a, half b) { - return static_cast(a) >= static_cast(b); - } - - static inline __host__ __device__ bool eq(half a, half b) { - // has to be explicitly casted to float for now, otherwise get error: more than one operator "==" matches these operands - // Note: find the overloading for == 
and != (probably THCTensorTypeUtils.cuh) and resolve - return static_cast(static_cast(a)) == static_cast(static_cast(b)); - } - - static inline __host__ __device__ bool ne(half a, half b) { - // has to be explicitly casted to float for now, otherwise get error: more than one operator "==" matches these operands - // Note: find the overloading for == and != (probably THCTensorTypeUtils.cuh) and resolve - return static_cast(static_cast(a)) != static_cast(static_cast(b)); - } - - static inline __host__ __device__ half exp(half a) { - return static_cast(std::exp(static_cast(a))); - } - - // note that exp10 is not in the std namespace. - static inline __host__ __device__ half exp10(half a) { - return static_cast(::exp10(static_cast(a))); - } - - static inline __host__ __device__ half log(half a) { - return static_cast(::log(static_cast(a))); - } - - static inline __host__ __device__ half log10(half a) { - return static_cast(::log10(static_cast(a))); - } - - static inline __host__ __device__ half log1p(half a) { - return static_cast(::log1p(static_cast(a))); - } - - static inline __host__ __device__ half log2(half a) { - return static_cast(::log2(static_cast(a))); - } - - static inline __host__ __device__ half lgamma(half a) { - return static_cast(::lgamma(static_cast(a))); - } - - static inline __host__ __device__ half expm1(half a) { - return static_cast(::expm1(static_cast(a))); - } - - static inline __host__ __device__ half cos(half a) { - return static_cast(::cos(static_cast(a))); - } - - static inline __host__ __device__ half sin(half a) { - return static_cast(::sin(static_cast(a))); - } - - static inline __host__ __device__ half sqrt(half a) { - return static_cast(::sqrt(static_cast(a))); - } - - // note that rsqrt is not in the std namespace. - static inline __host__ __device__ half rsqrt(half a) { - return static_cast(::rsqrt(static_cast(a))); - } - - static inline __host__ __device__ half ceil(half a) { - return static_cast(::ceil(static_cast(a))); - } - - static inline __host__ __device__ half floor(half a) { - return static_cast(::floor(static_cast(a))); - } - - static inline __host__ __device__ half trunc(half a) { - return static_cast(::trunc(static_cast(a))); - } - - static inline __host__ __device__ half neg(half a) { - return static_cast(-(static_cast(a))); - } - - static inline __host__ __device__ half acos(half a) { - return static_cast(::acos(static_cast(a))); - } - - static inline __host__ __device__ half cosh(half a) { - return static_cast(::cosh(static_cast(a))); - } - - static inline __host__ __device__ half asin(half a) { - return static_cast(::asin(static_cast(a))); - } - - static inline __host__ __device__ half sinh(half a) { - return static_cast(::sinh(static_cast(a))); - } - - static inline __host__ __device__ half tan(half a) { - return static_cast(::tan(static_cast(a))); - } - - static inline __host__ __device__ half atan(half a) { - return static_cast(::atan(static_cast(a))); - } - - static inline __host__ __device__ half tanh(half a) { - return static_cast(::tanh(static_cast(a))); - } - - - static inline __host__ __device__ half erf(half a) { - return static_cast(::erf(static_cast(a))); - } - - - static inline __host__ __device__ half erfc(half a) { - return static_cast(::erfc(static_cast(a))); - } - - // note that erfinv is not in the std namespace. 
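The reason this block of cast-through-float boilerplate can be deleted (and the much shorter specialization that replaces it below compiles) is that at::Half converts implicitly to and from float, so expressions like std::exp(a) or a + b promote to float math and narrow back on return. A standalone sketch of that conversion pattern follows, using a hypothetical Half16 model type that simply stores a float for clarity rather than the real 16-bit at::Half.

#include <cmath>
#include <cstdio>

// Hypothetical 'Half16' standing in for at::Half. Only the two implicit
// conversions matter for this sketch; a real half would pack IEEE fp16 bits.
struct Half16 {
  float v;
  Half16(float f = 0.f) : v(f) {}       // implicit construction from float
  operator float() const { return v; }  // implicit conversion to float
};

// With those conversions in place, no per-type special cases are needed:
Half16 half_sin(Half16 a) { return std::sin(a); }      // Half16 -> float -> sin -> Half16
Half16 half_add(Half16 a, Half16 b) { return a + b; }  // both operands promote to float

int main() {
  Half16 x(0.5f);
  std::printf("sin(0.5)   ~= %f\n", static_cast<float>(half_sin(x)));
  std::printf("0.5 + 0.25  = %f\n", static_cast<float>(half_add(x, Half16(0.25f))));
  return 0;
}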
- static inline __host__ __device__ half erfinv(half a) { - return static_cast(::erfinv(static_cast(a))); - } - - static inline __host__ __device__ half abs(half a) { -#ifdef __HIP_PLATFORM_HCC__ - return static_cast(std::abs(static_cast(a))); -#else - return static_cast(::abs(static_cast(a))); -#endif - } - - static inline __host__ __device__ half round(half a) { - return static_cast(::round(static_cast(a))); - } - - static inline __host__ __device__ half frac(half a) { +struct THCNumerics { + static inline __host__ __device__ at::Half min() { return at::numeric_limits::lowest(); } + static inline __host__ __device__ at::Half max() { return at::numeric_limits::max(); } + static inline __host__ __device__ at::Half lower_bound() { return at::numeric_limits::lower_bound(); } + static inline __host__ __device__ at::Half upper_bound() { return at::numeric_limits::upper_bound(); } + + static inline __host__ __device__ bool lt(at::Half a, at::Half b) { return a < b; } + static inline __host__ __device__ bool le(at::Half a, at::Half b) { return a <= b; } + static inline __host__ __device__ bool gt(at::Half a, at::Half b) { return a > b; } + static inline __host__ __device__ bool ge(at::Half a, at::Half b) { return a >= b; } + static inline __host__ __device__ bool eq(at::Half a, at::Half b) { return a == b; } + static inline __host__ __device__ bool ne(at::Half a, at::Half b) { return a != b; } + + static inline __host__ __device__ at::Half exp(at::Half a) { return std::exp(a); } + static inline __host__ __device__ at::Half exp10(at::Half a) { return ::exp10(a); } + static inline __host__ __device__ at::Half log(at::Half a) { return ::log(a); } + static inline __host__ __device__ at::Half log10(at::Half a) { return ::log10(a); } + static inline __host__ __device__ at::Half log1p(at::Half a) { return ::log1p(a); } + static inline __host__ __device__ at::Half log2(at::Half a) { return ::log2(a); } + static inline __host__ __device__ at::Half lgamma(at::Half a) { return ::lgamma(a); } + static inline __host__ __device__ at::Half expm1(at::Half a) { return ::expm1(a); } + static inline __host__ __device__ at::Half cos(at::Half a) { return ::cos(a); } + static inline __host__ __device__ at::Half sin(at::Half a) { return ::sin(a); } + static inline __host__ __device__ at::Half sqrt(at::Half a) { return ::sqrt(a); } + static inline __host__ __device__ at::Half rsqrt(at::Half a) { return ::rsqrt(a); } + static inline __host__ __device__ at::Half ceil(at::Half a) { return ::ceil(a); } + static inline __host__ __device__ at::Half floor(at::Half a) { return ::floor(a); } + static inline __host__ __device__ at::Half trunc(at::Half a) { return ::trunc(a); } + static inline __host__ __device__ at::Half neg(at::Half a) { return -a; } + static inline __host__ __device__ at::Half acos(at::Half a) { return ::acos(a); } + static inline __host__ __device__ at::Half cosh(at::Half a) { return ::cosh(a); } + static inline __host__ __device__ at::Half asin(at::Half a) { return ::asin(a); } + static inline __host__ __device__ at::Half sinh(at::Half a) { return ::sinh(a); } + static inline __host__ __device__ at::Half tan(at::Half a) { return ::tan(a); } + static inline __host__ __device__ at::Half atan(at::Half a) { return ::atan(a); } + static inline __host__ __device__ at::Half tanh(at::Half a) { return ::tanh(a); } + static inline __host__ __device__ at::Half erf(at::Half a) { return ::erf(a); } + static inline __host__ __device__ at::Half erfc(at::Half a) { return ::erfc(a); } + static inline __host__ __device__ 
at::Half erfinv(at::Half a) { return ::erfinv(a); } + static inline __host__ __device__ at::Half abs(at::Half a) { return std::abs(a); } + static inline __host__ __device__ at::Half round(at::Half a) { return ::round(a); } + + static inline __host__ __device__ at::Half frac(at::Half a) { #ifdef __CUDA_ARCH__ - return static_cast(a) - static_cast(::trunc(static_cast(a))); + return a - ::trunc(a); #else // __CUDA_ARCH__ - return static_cast(a) - static_cast(::floor(static_cast(a))); + return a - ::floor(a); #endif } - static inline __host__ __device__ half cinv(half a) { - return static_cast(1.0f / static_cast(a)); - } - - static inline __host__ __device__ half add(half a, half b) { - return static_cast(a) + static_cast(b); - } - - static inline __host__ __device__ half div(half a, half b) { - return static_cast(a) / static_cast(b); - } - - static inline __host__ __device__ half mul(half a, half b) { - return static_cast(a) * static_cast(b); - } - - static inline __host__ __device__ half sub(half a, half b) { - return static_cast(a) - static_cast(b); - } + static inline __host__ __device__ at::Half cinv(at::Half a) { return 1.0f / a; } + static inline __host__ __device__ at::Half add(at::Half a, at::Half b) { return a + b; } + static inline __host__ __device__ at::Half div(at::Half a, at::Half b) { return a / b; } + static inline __host__ __device__ at::Half mul(at::Half a, at::Half b) { return a * b; } + static inline __host__ __device__ at::Half sub(at::Half a, at::Half b) { return a - b; } + static inline __host__ __device__ at::Half pow(at::Half a, at::Half b) { return ::pow(a, b); } + static inline __host__ __device__ at::Half atan2(at::Half a, at::Half b) { return ::atan2(a, b); } - static inline __host__ __device__ half pow(half a, half b) { - return static_cast(::pow(static_cast(a), static_cast(b))); - } - - static inline __host__ __device__ half atan2(half a, half b) { - return static_cast(::atan2(static_cast(a), static_cast(b))); - } - - static inline __host__ __device__ bool isnan(half a) { + static inline __host__ __device__ bool isnan(at::Half a) { #ifdef _MSC_VER // Windows requires this explicit conversion. The reason is unclear // related issue with clang: https://reviews.llvm.org/D37906 - return ::isnan((float)static_cast(a)); + return ::isnan((float) a); #else - return ::isnan(static_cast(a)); + return ::isnan(a); #endif } - static inline __host__ __device__ bool isinf(half a) { + static inline __host__ __device__ bool isinf(at::Half a) { #ifdef _MSC_VER // Windows requires this explicit conversion. 
The reason is unclear // related issue with clang: https://reviews.llvm.org/D37906 - return ::isinf((float)static_cast(a)); + return ::isinf((float) a); #else - return ::isinf(static_cast(a)); + return ::isinf(a); #endif } @@ -510,35 +377,6 @@ struct ScalarConvert { static __host__ __device__ Out to(const In v) { return (Out) v; } }; -template -struct ScalarConvert { - static __host__ __device__ Out to(const half v) { -#ifdef __CUDA_ARCH__ - return (Out) __half2float(v); -#else - return (Out) THC_half2float(v); -#endif - } -}; - -template -struct ScalarConvert { - static __host__ __device__ half to(const In v) { -#ifdef __CUDA_ARCH__ - return __float2half((float) v); -#else - return THC_float2half((float) v); -#endif - } -}; - -template <> -struct ScalarConvert { - static __host__ __device__ half to(const half v) { - return v; - } -}; - // DEPRECATED: use static_cast in kernels instead of scalar_cast template __host__ __device__ T scalar_cast(U u) { diff --git a/aten/src/THC/THCStorage.cpp b/aten/src/THC/THCStorage.cpp index 96e3938e20b0f9..b6c52791eb56d6 100644 --- a/aten/src/THC/THCStorage.cpp +++ b/aten/src/THC/THCStorage.cpp @@ -1,7 +1,7 @@ #include "THCStorage.hpp" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include @@ -55,9 +55,9 @@ int THCStorage_getDevice(THCState* state, const THCStorage* storage) { THC_API THCStorage* THCStorage_new( THCState* state, - at::ScalarType scalar_type) { + caffe2::TypeMeta data_type) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(scalar_type), + data_type, 0, state->cudaDeviceAllocator, true).release(); diff --git a/aten/src/THC/THCStorage.cu b/aten/src/THC/THCStorage.cu index 43a293422335fc..97b66ac2b5f891 100644 --- a/aten/src/THC/THCStorage.cu +++ b/aten/src/THC/THCStorage.cu @@ -7,7 +7,7 @@ #include #endif -#include "THCHalf.h" +#include "TH/THHalf.h" #include "generic/THCStorage.cu" #include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCStorage.hpp b/aten/src/THC/THCStorage.hpp index ee683feced4bfb..e8dfc2213076f1 100644 --- a/aten/src/THC/THCStorage.hpp +++ b/aten/src/THC/THCStorage.hpp @@ -9,14 +9,20 @@ #include "ATen/ScalarType.h" +#include +#include +#include + namespace at { +#if defined(__CUDACC__) || defined(__HIP_PLATFORM_HCC__) template <> struct CTypeToScalarType<__half> : public CTypeToScalarType {}; +#endif } -THC_API THCStorage* THCStorage_new(THCState* state, at::ScalarType); +THC_API THCStorage* THCStorage_new(THCState* state, caffe2::TypeMeta); THC_API void THCStorage_retain(THCState *state, THCStorage *storage); diff --git a/aten/src/THC/THCStorageCopy.cu b/aten/src/THC/THCStorageCopy.cu index 8d7c869c12c004..c4f53f7160ca5b 100644 --- a/aten/src/THC/THCStorageCopy.cu +++ b/aten/src/THC/THCStorageCopy.cu @@ -1,7 +1,7 @@ #include "THCStorageCopy.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCTensorCopy.h" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THC/THCStorageCopy.h b/aten/src/THC/THCStorageCopy.h index 837056fc3801d2..250b60fcb2fe74 100644 --- a/aten/src/THC/THCStorageCopy.h +++ b/aten/src/THC/THCStorageCopy.h @@ -3,7 +3,7 @@ #include "THCStorage.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "generic/THCStorageCopy.h" #include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index b35551301383f8..bfef8fffb0f89d 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -40,8 +40,9 @@ int64_t 
THCTensor_strideLegacyNoScalars(THCState *state, const THCTensor *self, return THTensor_strideLegacyNoScalars(self, dim); } -THCTensor *THCTensor_new(THCState *state, at::ScalarType scalar_type) { - switch(scalar_type) { +THCTensor *THCTensor_new(THCState *state, caffe2::TypeMeta type_meta) { + auto scalar_type = at::dataTypeToScalarType(type_meta.id()); + switch (scalar_type) { case at::ScalarType::Byte: return THCudaByteTensor_new(state); case at::ScalarType::Char: @@ -189,13 +190,12 @@ void THCTensor_setStorageNd(THCState *state, THCTensor *self, THCStorage *storag if (!THTensor_getStoragePtr(self)) { THError("Tensor: invalid null storage"); } - auto scalar_type = at::dataTypeToScalarType(THTensor_getStoragePtr(self)->dtype()); - + auto data_type = THTensor_getStoragePtr(self)->dtype(); if (storage) { c10::raw::intrusive_ptr::incref(storage); THTensor_stealAndSetStoragePtr(self, storage); } else { - THTensor_stealAndSetStoragePtr(self, THCStorage_new(state, scalar_type)); + THTensor_stealAndSetStoragePtr(self, THCStorage_new(state, data_type)); } } diff --git a/aten/src/THC/THCTensor.hpp b/aten/src/THC/THCTensor.hpp index 3505354c834bb4..8fecaf0b2296fc 100644 --- a/aten/src/THC/THCTensor.hpp +++ b/aten/src/THC/THCTensor.hpp @@ -21,7 +21,7 @@ THC_API int64_t THCTensor_sizeLegacyNoScalars(THCState *state, const THCTensor * THC_API int64_t THCTensor_stride(THCState *state, const THCTensor *self, int dim); THC_API int64_t THCTensor_strideLegacyNoScalars(THCState *state, const THCTensor *self, int dim); -THC_API THCTensor *THCTensor_new(THCState *state, at::ScalarType scalar_type); +THC_API THCTensor *THCTensor_new(THCState *state, caffe2::TypeMeta type_meta); THC_API void THCTensor_resize(THCState *state, THCTensor *tensor, at::IntList size, at::IntList stride); THC_API void THCTensor_resizeNd(THCState *state, THCTensor *tensor, int nDimension, const int64_t *size, const int64_t *stride); diff --git a/aten/src/THC/THCTensorCopy.cu b/aten/src/THC/THCTensorCopy.cu index 259912a8d91806..844539c4ac0b27 100644 --- a/aten/src/THC/THCTensorCopy.cu +++ b/aten/src/THC/THCTensorCopy.cu @@ -1,5 +1,5 @@ #include "THCApply.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCNumerics.cuh" #include "THCTensorCopy.hpp" #include @@ -136,8 +136,7 @@ void THC_copyTensor(THCState* state, THCTensor* dst, THCTensor* src) { } else { // Types are different // Copy into the new format, contiguous, on the source device - srcContig = THCTensor_new(state, - at::CTypeToScalarType::to()); + srcContig = THCTensor_new(state, caffe2::TypeMeta::Make()); THCTensor_resizeAs(state, srcContig, dst); bool succ = diff --git a/aten/src/THC/THCTensorCopy.h b/aten/src/THC/THCTensorCopy.h index 74f2b592f54657..48dcc64b9fbedc 100644 --- a/aten/src/THC/THCTensorCopy.h +++ b/aten/src/THC/THCTensorCopy.h @@ -3,7 +3,7 @@ #include "THCTensor.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCStream.h" #include "generic/THCTensorCopy.h" diff --git a/aten/src/THC/THCTensorIndex.cu b/aten/src/THC/THCTensorIndex.cu index 0ea5951d4ea734..6b4a77ea816225 100644 --- a/aten/src/THC/THCTensorIndex.cu +++ b/aten/src/THC/THCTensorIndex.cu @@ -4,7 +4,7 @@ #include "THCBlas.h" #include "THCTensorCopy.h" #include "THCTensorRandom.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCApply.cuh" #include "THCReduce.cuh" #include "THCDeviceUtils.cuh" diff --git a/aten/src/THC/THCTensorMathPairwise.cu b/aten/src/THC/THCTensorMathPairwise.cu index b1be123b886e03..cd5a77c2227181 100644 --- 
a/aten/src/THC/THCTensorMathPairwise.cu +++ b/aten/src/THC/THCTensorMathPairwise.cu @@ -1,6 +1,6 @@ #include "THCTensorMath.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCTensorCopy.h" #include "THCApply.cuh" #include "THCNumerics.cuh" @@ -21,26 +21,6 @@ struct TensorAddConstantOp { const T val; }; -template <> -struct TensorAddConstantOp { - TensorAddConstantOp(half v) : fval(THC_half2float(v)) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin + fval; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv += fval; - *v = __float2half(fv); - } - - const float fval; -}; - - template struct TensorSubConstantOp { TensorSubConstantOp(T v) : val(v) {} @@ -55,27 +35,6 @@ struct TensorSubConstantOp { const T val; }; - -template <> -struct TensorSubConstantOp { - TensorSubConstantOp(half v): fval(-(THC_half2float(v))) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin + fval; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv += fval; - *v = __float2half(fv); - } - - const float fval; -}; - - template struct TensorMulConstantOp { TensorMulConstantOp(T v) : val(v) {} @@ -90,25 +49,6 @@ struct TensorMulConstantOp { const T val; }; -template <> -struct TensorMulConstantOp { - TensorMulConstantOp(half v) : fval(THC_half2float(v)) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin * fval; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv *= fval; - *v = __float2half(fv); - } - - const float fval; -}; - template struct TensorDivConstantOp { TensorDivConstantOp(T v) : val(v) {} @@ -151,24 +91,6 @@ struct TensorDivConstantOp { const double val; }; -template <> -struct TensorDivConstantOp { - TensorDivConstantOp(half v) : fval(1.f / THC_half2float(v)) {} - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin * fval; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv *= fval; - *v = __float2half(fv); - } - - const float fval; -}; - template static __device__ __forceinline__ typename std::enable_if::value, bool>::type @@ -232,22 +154,18 @@ struct TensorRemainderOp { }; template <> -struct TensorRemainderOp { - TensorRemainderOp(half v): fval(THC_half2float(v)) {} +struct TensorRemainderOp { + TensorRemainderOp(at::Half v): val(v) {} - __device__ __forceinline__ void operator()(half* out, half* in) { - float fin = __half2float(*in); - float fout = fin - fval * floorf(fin / fval); - *out = __float2half(fout); + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in) { + *out = *in - val * floorf(*in / val); } - __device__ __forceinline__ void operator()(half* v) { - float fv = __half2float(*v); - fv = fv - fval * floorf(fv / fval); - *v = __float2half(fv); + __device__ __forceinline__ void operator()(at::Half* v) { + *v = *v - val * floorf(*v / val); } - const float fval; + const at::Half val; }; template @@ -278,21 +196,6 @@ struct TensorFmodOp { const double val; }; -template <> -struct TensorFmodOp { - TensorFmodOp(half v): fval(THC_half2float(v)) {} - - __device__ 
__forceinline__ void operator()(half* out, half* in) { - *out = __float2half(fmodf(__half2float(*in), fval)); - } - - __device__ __forceinline__ void operator()(half* v) { - *v = __float2half(fmodf(__half2float(*v), fval)); - } - - const float fval; -}; - template struct TensorTriOp { TensorTriOp(T *start_, int64_t stride0_, int64_t stride1_, int64_t k_) diff --git a/aten/src/THC/THCTensorMathPointwise.cuh b/aten/src/THC/THCTensorMathPointwise.cuh index fb15a05b155c0f..fb1072b8bbddc1 100644 --- a/aten/src/THC/THCTensorMathPointwise.cuh +++ b/aten/src/THC/THCTensorMathPointwise.cuh @@ -4,7 +4,7 @@ #include #include "THCTensorMath.h" #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCTensorCopy.h" #include "THCApply.cuh" #include "THCNumerics.cuh" @@ -31,19 +31,6 @@ struct TensorSigmoidOp { } }; -template <> -struct TensorSigmoidOp { - __device__ __forceinline__ void operator()(half* out, half* in) const { - float fin = __half2float(*in); - *out = __float2half(1.0f / (1.0f + expf(- fin))); - } - - __device__ __forceinline__ void operator()(half* v) const { - float fv = __half2float(*v); - *v = __float2half(1.0f / (1.0f + expf(- fv))); - } -}; - template struct TensorSignOp { __device__ __forceinline__ void operator()(T* out, T* in) { @@ -70,19 +57,6 @@ struct TensorSignOp { } }; -template <> -struct TensorSignOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - float orig = __half2float(*in); - *out = __float2half((orig > 0) - (orig < 0)); - } - - __device__ __forceinline__ void operator()(half* v) { - float orig = __half2float(*v); - *v = __float2half((orig > 0) - (orig < 0)); - } -}; - template struct TensorCAddOp { TensorCAddOp(T v) : val(v) {} @@ -98,31 +72,6 @@ struct TensorCAddOp { T val; }; -template <> -struct TensorCAddOp { - TensorCAddOp(half v) : val(v) {} - - __device__ __forceinline__ void operator()(half* out, half* in) { - float fout = __half2float(*out); - float fval = __half2float(val); - float fin = __half2float(*in); - - fout += fval * fin; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fval = __half2float(val); - - float fout = fin1 + fval * fin2; - *out = __float2half(fout); - } - - half val; -}; - template struct TensorMulOp { __device__ __forceinline__ void operator()(T* out, T* in) { @@ -134,23 +83,6 @@ struct TensorMulOp { } }; -template <> -struct TensorMulOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - float fout = __half2float(*out); - float fin = __half2float(*in); - fout *= fin; - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fout = fin1 * fin2; - *out = __float2half(fout); - } -}; - template struct TensorPowOp { TensorPowOp(T v) : val(v) {} @@ -249,7 +181,6 @@ struct TensorCPowOp { } }; - template <> struct TensorCPowOp { __device__ __forceinline__ void operator()(double* out, double* in) { @@ -261,25 +192,6 @@ struct TensorCPowOp { } }; -template <> -struct TensorCPowOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - // No fp16 pow function yet - float fout = __half2float(*out); - float fin = __half2float(*in); - fout = powf(fout, fin); - *out = __float2half(fout); - } - - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - // No fp16 pow 
function yet - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - float fout = powf(fin1, fin2); - *out = __float2half(fout); - } -}; - template static __device__ __forceinline__ typename std::enable_if::value, bool>::type @@ -336,17 +248,13 @@ struct TensorCRemainderOp { }; template <> -struct TensorCRemainderOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - float fout = __half2float(*out); - float fin = __half2float(*in); - *out = fin != 0 ? __float2half(fout - fin * floorf(fout / fin)) : __float2half(NAN); +struct TensorCRemainderOp { + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in) { + *out = *in != 0.f ? *out - *in * floorf(*out / *in) : NAN; } - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - float fin1 = __half2float(*in1); - float fin2 = __half2float(*in2); - *out = fin2 != 0 ? __float2half(fin1 - fin2 * floorf(fin1 / fin2)) : __float2half(NAN); + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in1, at::Half* in2) { + *out = *in2 != 0.f ? *in1 - *in2 * floorf(*in1 / *in2) : NAN; } }; @@ -384,13 +292,13 @@ struct TensorCFmodOp { }; template <> -struct TensorCFmodOp { - __device__ __forceinline__ void operator()(half* out, half* in) { - *out = __float2half(fmodf(__half2float(*out), __half2float(*in))); +struct TensorCFmodOp { + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in) { + *out = fmodf(*out, *in); } - __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { - *out = __float2half(fmodf(__half2float(*in1), __half2float(*in2))); + __device__ __forceinline__ void operator()(at::Half* out, at::Half* in1, at::Half* in2) { + *out = fmodf(*in1, *in2); } }; @@ -594,7 +502,6 @@ struct TensorRShiftOp { } }; - template <> struct TensorRShiftOp { __device__ __forceinline__ void @@ -671,7 +578,7 @@ template struct TensorDigammaOp { __device__ __forceinline__ void operator()(T* out, T* in) { - using compute_type = typename std::conditional::value, accreal, T>::type; + using compute_type = typename std::conditional::value, accreal, T>::type; static const double PI_f64 = 3.14159265358979323846; static const compute_type PSI_10 = 2.25175258906672110764; static const compute_type A[] = { @@ -731,7 +638,7 @@ struct TensorDigammaOp { template struct TensorTrigammaOp { - using compute_type = typename std::conditional::value, accreal, T>::type; + using compute_type = typename std::conditional::value, accreal, T>::type; __device__ __forceinline__ void operator()(T* out, T* in) { const compute_type PI = 3.14159265358979323846; diff --git a/aten/src/THC/THCTensorMode.cuh b/aten/src/THC/THCTensorMode.cuh index 0158f254a2d014..a3ed2ae8e4be93 100644 --- a/aten/src/THC/THCTensorMode.cuh +++ b/aten/src/THC/THCTensorMode.cuh @@ -7,33 +7,33 @@ struct ThrustHalfLess { - __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) { - return THCNumerics::lt(lhs, rhs); + __host__ __device__ inline bool operator()(const at::Half& lhs, const at::Half& rhs) { + return THCNumerics::lt(lhs, rhs); } }; struct ThrustHalfNotEqualTo { - __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) { - return THCNumerics::ne(lhs, rhs); + __host__ __device__ inline bool operator()(const at::Half& lhs, const at::Half& rhs) { + return THCNumerics::ne(lhs, rhs); } }; struct ThrustHalfEqualTo { - __host__ __device__ inline bool operator()(const half& lhs, const half& rhs) { - return THCNumerics::eq(lhs, rhs); + __host__ __device__ 
inline bool operator()(const at::Half& lhs, const at::Half& rhs) { + return THCNumerics::eq(lhs, rhs); } }; struct ThrustHalfEqualToPredicate { - ThrustHalfEqualToPredicate(half val): val_(val) {} - __host__ __device__ inline bool operator()(half x) { - return THCNumerics::eq(val_, x); + ThrustHalfEqualToPredicate(at::Half val): val_(val) {} + __host__ __device__ inline bool operator()(at::Half x) { + return THCNumerics::eq(val_, x); } - half val_; + at::Half val_; }; template diff --git a/aten/src/THC/THCTensorRandom.cu b/aten/src/THC/THCTensorRandom.cu index 8eb580169cc953..386473a430329a 100644 --- a/aten/src/THC/THCTensorRandom.cu +++ b/aten/src/THC/THCTensorRandom.cu @@ -14,7 +14,7 @@ #include #include -#define MAX_NUM_BLOCKS 200 +#define MAX_NUM_BLOCKS 200 #define BLOCK_SIZE 256 @@ -107,11 +107,11 @@ __device__ inline T reverse_bounds(T value) { } -__device__ inline half half_uniform_scale_and_shift(float x, double a, double b) { - half width = ScalarConvert::to(b - a); - half start = ScalarConvert::to(a); - half scaled = THCNumerics::mul(reverse_bounds(ScalarConvert::to(x)), width); - return THCNumerics::add(scaled, start); +__device__ inline at::Half half_uniform_scale_and_shift(float x, double a, double b) { + at::Half width = ScalarConvert::to(b - a); + at::Half start = ScalarConvert::to(a); + at::Half scaled = THCNumerics::mul(reverse_bounds(ScalarConvert::to(x)), width); + return THCNumerics::add(scaled, start); } #define GENERATE_KERNEL1(NAME, T, ARG1, CURAND_T, CURAND_FUNC, TRANSFORM) \ @@ -181,10 +181,10 @@ GENERATE_KERNEL1(generate_exponential, double, double lambda, double, curand_uni GENERATE_KERNEL2(generate_cauchy, float, double median, double sigma, float, curand_uniform, (float)(median + sigma * tan(M_PI*(x-0.5)))) GENERATE_KERNEL2(generate_cauchy, double, double median, double sigma, double, curand_uniform_double, (double)(median + sigma * tan(M_PI*(x-0.5)))) -GENERATE_KERNEL2(generate_uniform, half, double a, double b, float, curand_uniform, (half_uniform_scale_and_shift(x, a, b))) -GENERATE_KERNEL2(generate_normal, half, double mean, double stdv, float, curand_normal, (ScalarConvert::to((x * stdv) + mean))) -GENERATE_KERNEL1(generate_exponential, half, double lambda, float, curand_uniform, (ScalarConvert::to((float)(-1. / lambda * log(x))))) -GENERATE_KERNEL2(generate_cauchy, half, double median, double sigma, float, curand_uniform, (ScalarConvert::to((float)(median + sigma * tan(M_PI*(x-0.5)))))) +GENERATE_KERNEL2(generate_uniform, at::Half, double a, double b, float, curand_uniform, (half_uniform_scale_and_shift(x, a, b))) +GENERATE_KERNEL2(generate_normal, at::Half, double mean, double stdv, float, curand_normal, (ScalarConvert::to((x * stdv) + mean))) +GENERATE_KERNEL1(generate_exponential, at::Half, double lambda, float, curand_uniform, (ScalarConvert::to((float)(-1. 
/ lambda * log(x))))) +GENERATE_KERNEL2(generate_cauchy, at::Half, double median, double sigma, float, curand_uniform, (ScalarConvert::to((float)(median + sigma * tan(M_PI*(x-0.5)))))) #include "generic/THCTensorRandom.cu" #include "THCGenerateAllTypes.h" diff --git a/aten/src/THC/THCTensorSort.cuh b/aten/src/THC/THCTensorSort.cuh index 9b75a7355a00f4..b8feedcc1f21c9 100644 --- a/aten/src/THC/THCTensorSort.cuh +++ b/aten/src/THC/THCTensorSort.cuh @@ -1,6 +1,8 @@ #ifndef THC_TENSORSORT_CUH #define THC_TENSORSORT_CUH +#include "THCTensorMath.h" +#include "THCGeneral.h" #include "THCReduceApplyUtils.cuh" #include "THCSortUtils.cuh" #include "THCTensorCopy.h" diff --git a/aten/src/THC/THCTensorTopK.cuh b/aten/src/THC/THCTensorTopK.cuh index 4f7a6b8c697913..6d1ef1b6bbfe37 100644 --- a/aten/src/THC/THCTensorTopK.cuh +++ b/aten/src/THC/THCTensorTopK.cuh @@ -113,10 +113,10 @@ struct TopKTypeConfig { }; template <> -struct TopKTypeConfig { +struct TopKTypeConfig { typedef uint32_t RadixType; - static inline __device__ RadixType convert(half v) { + static inline __device__ RadixType convert(at::Half v) { #if CUDA_VERSION >= 8000 RadixType x = __half_as_ushort(v); RadixType mask = -((x >> 15)) | 0x8000; @@ -127,16 +127,16 @@ struct TopKTypeConfig { #endif } - static inline __device__ half deconvert(RadixType v) { + static inline __device__ at::Half deconvert(RadixType v) { #if CUDA_VERSION >= 8000 RadixType mask = ((v >> 15) - 1) | 0x8000; return __ushort_as_half(v ^ mask); #else assert(false); - return ScalarConvert::to(0); + return ScalarConvert::to(0); #endif } -}; +}; // This function counts the distribution of all input values in a // slice we are selecting by radix digit at `radixDigitPos`, but only @@ -214,7 +214,7 @@ __device__ DataType findPattern(DataType* smem, BitDataType desired, BitDataType desiredMask) { #ifdef __HIP_PLATFORM_HCC__ - if (threadIdx.x < 64) { + if (threadIdx.x < 64) { #else if (threadIdx.x < 32) { #endif diff --git a/aten/src/THC/THCTensorTypeUtils.cuh b/aten/src/THC/THCTensorTypeUtils.cuh index 377b363c006ba2..aecd8f01713d55 100644 --- a/aten/src/THC/THCTensorTypeUtils.cuh +++ b/aten/src/THC/THCTensorTypeUtils.cuh @@ -4,7 +4,7 @@ #include #include #include "THCGeneral.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCTensor.hpp" #include "THCTensorInfo.cuh" #include "THCTensor.hpp" @@ -80,56 +80,4 @@ struct ScalarInv { static __host__ __device__ T to(const T v) { return ((T) 1) / v; } }; -template <> -struct ScalarNegate { - static __host__ __device__ half to(const half v) { -#ifdef __CUDA_ARCH__ - return __float2half(-__half2float(v)); -#else -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - half out = v; -#else - __half_raw out = __half_raw(v); -#endif - out.x ^= 0x8000; // toggle sign bit - return out; -#endif - } -}; - -template <> -struct ScalarInv { - static __host__ __device__ half to(const half v) { -#if defined (__CUDA_ARCH_) || defined(__HIP_PLATFORM_HCC__) - return __float2half(1.0f / __half2float(v)); -#else - float fv = THC_half2float(v); - fv = 1.0f / fv; - return THC_float2half(fv); -#endif - } -}; - -inline bool operator==(half a, half b) { -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - return a.x == b.x; -#else - __half_raw araw, braw; - araw = __half_raw(a); - braw = __half_raw(b); - return araw.x == braw.x; -#endif -} - -inline bool operator!=(half a, half b) { -#if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) - return a.x != b.x; -#else - __half_raw araw, braw; - araw = __half_raw(a); - braw = 
__half_raw(b); - return araw.x != braw.x; -#endif -} - #endif // THC_TENSOR_TYPE_UTILS_INC diff --git a/aten/src/THC/generic/THCStorage.cpp b/aten/src/THC/generic/THCStorage.cpp index 8918a449e19585..36a001059a5787 100644 --- a/aten/src/THC/generic/THCStorage.cpp +++ b/aten/src/THC/generic/THCStorage.cpp @@ -3,6 +3,7 @@ #else #include +#include scalar_t* THCStorage_(data)(THCState *state, const THCStorage *self) { @@ -43,7 +44,7 @@ scalar_t THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t ind THCStorage* THCStorage_(new)(THCState *state) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), 0, state->cudaDeviceAllocator, true).release(); @@ -53,7 +54,7 @@ THCStorage* THCStorage_(new)(THCState *state) THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, state->cudaDeviceAllocator, true).release(); @@ -64,7 +65,7 @@ THCStorage* THCStorage_(newWithAllocator)(THCState *state, ptrdiff_t size, at::Allocator* allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, allocator, true).release(); @@ -117,7 +118,7 @@ THCStorage* THCStorage_(newWithDataAndAllocator)( ptrdiff_t size, at::Allocator* allocator) { THStorage* storage = c10::make_intrusive( - at::scalarTypeToDataType(at::CTypeToScalarType::to()), + caffe2::TypeMeta::Make(), size, std::move(data), allocator, diff --git a/aten/src/THC/generic/THCStorage.cu b/aten/src/THC/generic/THCStorage.cu index 88ed2e5541820e..cd12d2bcae0fb6 100644 --- a/aten/src/THC/generic/THCStorage.cu +++ b/aten/src/THC/generic/THCStorage.cu @@ -18,7 +18,7 @@ void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) THCStorage_resize(state, self, size); } -THC_API int THCStorage_(getDevice)(THCState* state, const THCStorage* storage) { +int THCStorage_(getDevice)(THCState* state, const THCStorage* storage) { return THCStorage_getDevice(state, storage); } diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp index db2b44511c2329..a7779047863466 100644 --- a/aten/src/THC/generic/THCTensor.cpp +++ b/aten/src/THC/generic/THCTensor.cpp @@ -63,13 +63,21 @@ scalar_t *THCTensor_(data)(THCState *state, const THCTensor *self) /* Empty init */ THCTensor *THCTensor_(new)(THCState *state) { - return c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); + return c10::make_intrusive( + c10::intrusive_ptr::reclaim(THCStorage_(new)(state)), + at::CUDATensorId(), + false + ).release(); } /* Pointer-copy init */ THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor) { - THCTensor *self = c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); + THCTensor *self = c10::make_intrusive( + c10::intrusive_ptr::reclaim(THCStorage_(new)(state)), + at::CUDATensorId(), + false + ).release(); THCTensor_(setStorageNd)(state, self, THTensor_getStoragePtr(tensor), @@ -85,7 +93,11 @@ THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrd if (strides.data()) { AT_CHECK(sizes.size() == strides.size(), "number of sizes and strides must match"); } - THCTensor *self = c10::make_intrusive(THCStorage_(new)(state), at::CUDATensorId(), false).release(); + THCTensor *self = c10::make_intrusive( + 
c10::intrusive_ptr::reclaim(THCStorage_(new)(state)), + at::CUDATensorId(), + false + ).release(); THCTensor_(setStorageNd)(state, self, storage, storageOffset, sizes.size(), const_cast(sizes.data()), const_cast(strides.data())); @@ -594,13 +606,13 @@ int THCTensor_(checkGPU)(THCState *state, unsigned int nTensors, ...) } const int tensorDev = THCTensor_(getDevice)(state, tensor); - + // Skips CPU tensors if (tensorDev == -1) { continue; } // Checks all tensors are on the same device - if (tensorDev != curDev) { - valid = 0; + if (tensorDev != curDev) { + valid = 0; break; } } diff --git a/aten/src/THC/generic/THCTensor.cu b/aten/src/THC/generic/THCTensor.cu index 98478341575c75..0c694f5e4e25c8 100644 --- a/aten/src/THC/generic/THCTensor.cu +++ b/aten/src/THC/generic/THCTensor.cu @@ -2,7 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensor.cu" #else -THC_API int THCTensor_(getDevice)(THCState* state, const THCTensor* tensor) { +int THCTensor_(getDevice)(THCState* state, const THCTensor* tensor) { return THCTensor_getDevice(state, tensor); } diff --git a/aten/src/THC/generic/THCTensorCopy.cpp b/aten/src/THC/generic/THCTensorCopy.cpp index 96ab307182639c..0c20edfbd9fd36 100644 --- a/aten/src/THC/generic/THCTensorCopy.cpp +++ b/aten/src/THC/generic/THCTensorCopy.cpp @@ -58,6 +58,13 @@ void THTensor_(copyCuda)(THCState *state, THTensor *self, struct THCTensor *src) { THTensor *selfc = THTensor_(newContiguous)(self); + int tensorDevice = THCTensor_(getDevice)(state, src); + int currentDevice; + THCudaCheck(cudaGetDevice(¤tDevice)); + + if (currentDevice != tensorDevice) { + THCudaCheck(cudaSetDevice(tensorDevice)); + } src = THCTensor_(newContiguous)(state, src); cudaStream_t stream = THCState_getCurrentStream(state); @@ -68,6 +75,10 @@ void THTensor_(copyCuda)(THCState *state, THTensor *self, struct THCTensor *src) stream)); THCudaCheck(cudaStreamSynchronize(stream)); + if (currentDevice != tensorDevice) { + THCudaCheck(cudaSetDevice(currentDevice)); + } + THCTensor_(free)(state, src); THTensor_(freeCopyTo)(selfc, self); } diff --git a/aten/src/THC/generic/THCTensorCopy.cu b/aten/src/THC/generic/THCTensorCopy.cu index 0320fdfe8035a5..2c05c74f247a48 100644 --- a/aten/src/THC/generic/THCTensorCopy.cu +++ b/aten/src/THC/generic/THCTensorCopy.cu @@ -2,16 +2,15 @@ #define THC_GENERIC_FILE "generic/THCTensorCopy.cu" #else -THC_API void -THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) { +void THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) { if (dst == src) return; THC_copyTensor(state, dst, src); } template <> THCTensor *THCTensor_newClone(THCState *state, THCTensor *self) { - THCTensor* tensor = THCTensor_new( - state, at::dataTypeToScalarType(THTensor_getStoragePtr(self)->dtype())); + THCTensor* tensor = + THCTensor_new(state, THTensor_getStoragePtr(self)->dtype()); THCTensor_resizeAs(state, tensor, self); THC_copyTensor(state, tensor, self); return tensor; @@ -51,16 +50,14 @@ void THCTensor_copyIgnoringOverlaps(THCState* state, THCTensor* dst, T ReadOnly); } -THC_API void -THCTensor_(copyIgnoringOverlaps)(THCState* state, THCTensor* dst, THCTensor* src) { +void THCTensor_(copyIgnoringOverlaps)(THCState* state, THCTensor* dst, THCTensor* src) { THCTensor_copyIgnoringOverlaps(state, dst, src); } #define IMPLEMENT_THC_CUDA_TENSOR_COPY(TYPEC, TYPECUDA, SCALARC) \ - THC_API void \ - THCTensor_(copyCuda##TYPEC)(THCState *state, \ - THCTensor *self, \ - THCuda##TYPECUDA##Tensor *src) { \ + void THCTensor_(copyCuda##TYPEC)(THCState *state, \ + THCTensor *self, \ + 
THCuda##TYPECUDA##Tensor *src) { \ THC_copyTensor(state, self, src); \ } @@ -72,7 +69,7 @@ IMPLEMENT_THC_CUDA_TENSOR_COPY(Long, Long, int64_t) // THCudaTensor aka the non-existent THCudaFloatTensor IMPLEMENT_THC_CUDA_TENSOR_COPY(Float, , float) IMPLEMENT_THC_CUDA_TENSOR_COPY(Double, Double, double) -IMPLEMENT_THC_CUDA_TENSOR_COPY(Half, Half, half) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Half, Half, at::Half) #undef IMPLEMENT_THC_CUDA_TENSOR_COPY diff --git a/aten/src/THC/generic/THCTensorMasked.cu b/aten/src/THC/generic/THCTensorMasked.cu index f7e3e3f32a9a18..684ce31b141f79 100644 --- a/aten/src/THC/generic/THCTensorMasked.cu +++ b/aten/src/THC/generic/THCTensorMasked.cu @@ -3,9 +3,8 @@ #else -THC_API void -THCTensor_(maskedFill)(THCState* state, - THCTensor *tensor, THCudaByteTensor *mask, scalar_t value) +void THCTensor_(maskedFill)(THCState* state, + THCTensor *tensor, THCudaByteTensor *mask, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, mask)); THArgCheck(THCTensor_(nElement)(state, tensor) == @@ -20,9 +19,8 @@ THCTensor_(maskedFill)(THCState* state, THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(maskedFillByte)(THCState* state, - THCTensor *tensor, THByteTensor *mask, scalar_t value) +void THCTensor_(maskedFillByte)(THCState* state, + THCTensor *tensor, THByteTensor *mask, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, tensor)); THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, mask->sizes(), {}); @@ -31,9 +29,8 @@ THCTensor_(maskedFillByte)(THCState* state, THCudaByteTensor_free(state, maskCuda); } -THC_API void -THCTensor_(maskedCopy)(THCState* state, - THCTensor *tensor, THCudaByteTensor *mask, THCTensor *src) +void THCTensor_(maskedCopy)(THCState* state, + THCTensor *tensor, THCudaByteTensor *mask, THCTensor *src) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask)); ptrdiff_t maskSize = THCudaByteTensor_nElement(state, mask); @@ -98,9 +95,8 @@ THCTensor_(maskedCopy)(THCState* state, THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(maskedCopyByte)(THCState* state, - THCTensor *tensor, THByteTensor *mask, THCTensor *src) { +void THCTensor_(maskedCopyByte)(THCState* state, + THCTensor *tensor, THByteTensor *mask, THCTensor *src) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, mask->sizes(), {}); THCudaByteTensor_copyByte(state, maskCuda, mask); @@ -108,9 +104,8 @@ THCTensor_(maskedCopyByte)(THCState* state, THCudaByteTensor_free(state, maskCuda); } -THC_API void -THCTensor_(maskedSelect)(THCState* state, - THCTensor* tensor, THCTensor* src, THCudaByteTensor* mask) { +void THCTensor_(maskedSelect)(THCState* state, + THCTensor* tensor, THCTensor* src, THCudaByteTensor* mask) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, tensor, src, mask)); THArgCheck(THCudaByteTensor_nElement(state, mask) == THCTensor_(nElement)(state, src), @@ -171,9 +166,8 @@ THCTensor_(maskedSelect)(THCState* state, } // FIXME: remove now that we have THCudaByteTensor? 
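The THCTensorCopy.cpp hunk above introduces a save/switch/restore dance around the cross-device copy: it records the current device, switches to the source tensor's device before making the source contiguous and issuing the memcpy, and switches back afterwards. A minimal sketch of the same pattern as an RAII guard; the ScopedCudaDevice name is illustrative and not part of this patch:

// Illustrative only: RAII form of the cudaGetDevice/cudaSetDevice
// save/restore pattern added to THTensor_(copyCuda) above.
struct ScopedCudaDevice {
  int prev = -1;
  explicit ScopedCudaDevice(int target) {
    THCudaCheck(cudaGetDevice(&prev));
    if (prev != target) {
      THCudaCheck(cudaSetDevice(target));
    }
  }
  ~ScopedCudaDevice() {
    // Best-effort restore; avoid asserting from a destructor.
    cudaSetDevice(prev);
  }
};
// usage sketch: ScopedCudaDevice guard(THCTensor_(getDevice)(state, src));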
-THC_API void -THCTensor_(maskedSelectByte)(THCState* state, - THCTensor *tensor, THCTensor *src, THByteTensor *mask) +void THCTensor_(maskedSelectByte)(THCState* state, + THCTensor *tensor, THCTensor *src, THByteTensor *mask) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, tensor, src)); THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, mask->sizes(), {}); diff --git a/aten/src/THC/generic/THCTensorMath.cu b/aten/src/THC/generic/THCTensorMath.cu index 9ffe626dd8425f..c4f7afb6a227b9 100644 --- a/aten/src/THC/generic/THCTensorMath.cu +++ b/aten/src/THC/generic/THCTensorMath.cu @@ -2,8 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMath.cu" #else -THC_API void -THCTensor_(fill)(THCState* state, THCTensor *self_, scalar_t value) +void THCTensor_(fill)(THCState* state, THCTensor *self_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); @@ -15,8 +14,7 @@ THCTensor_(fill)(THCState* state, THCTensor *self_, scalar_t value) THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(zero)(THCState *state, THCTensor *self_) +void THCTensor_(zero)(THCState *state, THCTensor *self_) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); if (THCTensor_(isContiguous)(state, self_)) { @@ -35,16 +33,14 @@ THCTensor_(zero)(THCState *state, THCTensor *self_) THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(zerosLike)(THCState *state, THCTensor *r_, THCTensor *input) +void THCTensor_(zerosLike)(THCState *state, THCTensor *r_, THCTensor *input) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, r_, input)); THCTensor_(resizeAs)(state, r_, input); THCTensor_(zero)(state, r_); } -THC_API void -THCTensor_(onesLike)(THCState *state, THCTensor *r_, THCTensor *input) +void THCTensor_(onesLike)(THCState *state, THCTensor *r_, THCTensor *input) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, r_, input)); THCTensor_(resizeAs)(state, r_, input); diff --git a/aten/src/THC/generic/THCTensorMathBlas.cu b/aten/src/THC/generic/THCTensorMathBlas.cu index a37645de394de8..d83b8ff929b4f6 100644 --- a/aten/src/THC/generic/THCTensorMathBlas.cu +++ b/aten/src/THC/generic/THCTensorMathBlas.cu @@ -5,8 +5,7 @@ #define ERROR_ONLY_FP_TYPES(func) \ THError("%s for CUDA tensors only supports floating-point types. 
Try converting the tensors with .float()", func); -THC_API accreal -THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) +accreal THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); @@ -27,11 +26,10 @@ THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) THCTensor_(data)(state, self), 1, THCTensor_(data)(state, src), 1); #elif defined(THC_REAL_IS_HALF) - accreal result = ScalarConvert::to( - THCudaBlas_Hdot(state, + accreal result = THCudaBlas_Hdot(state, THCTensor_(nElement)(state, self), THCTensor_(data)(state, self), 1, - THCTensor_(data)(state, src), 1)); + THCTensor_(data)(state, src), 1); #endif THCTensor_(free)(state, src); @@ -44,8 +42,7 @@ THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) #endif } -THC_API void -THCTensor_(addmv)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *mat, THCTensor *vec) +void THCTensor_(addmv)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *mat, THCTensor *vec) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, mat, vec)); @@ -152,8 +149,7 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, s #endif } -THC_API void -THCTensor_(addr)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *vec1, THCTensor *vec2) +void THCTensor_(addr)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *vec1, THCTensor *vec2) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, vec1, vec2)); @@ -250,8 +246,7 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, sc #endif } -THC_API void -THCTensor_(addmm)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *m1, THCTensor *m2) +void THCTensor_(addmm)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, scalar_t alpha, THCTensor *m1, THCTensor *m2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) @@ -414,9 +409,8 @@ THCTensor_(addmm)(THCState *state, THCTensor *r_, scalar_t beta, THCTensor *t, s #endif } -THC_API void -THCTensor_(addbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor *t, - scalar_t alpha, THCTensor *batch1, THCTensor *batch2) { +void THCTensor_(addbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor *t, + scalar_t alpha, THCTensor *batch1, THCTensor *batch2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2)); THArgCheck(THCTensor_(nDimensionLegacyNoScalars)(state, t) == 2, 4, "expected 2D tensor"); @@ -479,9 +473,8 @@ __global__ void createBatchGemmBuffer3(const scalar_t** buffer1, const scalar_t } } -THC_API void -THCTensor_(baddbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor *t, - scalar_t alpha, THCTensor *batch1, THCTensor *batch2) { +void THCTensor_(baddbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor *t, + scalar_t alpha, THCTensor *batch1, THCTensor *batch2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || 
defined(THC_REAL_IS_DOUBLE) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2)); THArgCheck(THCTensor_(nDimensionLegacyNoScalars)(state, t) == 3, 4, "expected 3D tensor"); @@ -746,7 +739,7 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, scalar_t beta, THCTensor #endif } -THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTensor *rpivots_, THCudaIntTensor *rinfo_, int pivot, THCTensor *a) +void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTensor *rpivots_, THCudaIntTensor *rinfo_, int pivot, THCTensor *a) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THAssert(THCTensor_(checkGPU)(state, 2, ra_, a)); @@ -853,8 +846,8 @@ THC_API void THCTensor_(btrifact)(THCState *state, THCTensor *ra_, THCudaIntTens } -THC_API void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b, - THCTensor *atf, THCudaIntTensor *pivots) +void THCTensor_(btrisolve)(THCState *state, THCTensor *rb_, THCTensor *b, + THCTensor *atf, THCudaIntTensor *pivots) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THAssert(THCTensor_(checkGPU)(state, 3, rb_, atf, b)); diff --git a/aten/src/THC/generic/THCTensorMathCompare.cu b/aten/src/THC/generic/THCTensorMathCompare.cu index 0a0041ab9e4784..3c8e8ce0c325a7 100644 --- a/aten/src/THC/generic/THCTensorMathCompare.cu +++ b/aten/src/THC/generic/THCTensorMathCompare.cu @@ -2,7 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMathCompare.cu" #else -THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -10,7 +10,7 @@ THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -18,7 +18,7 @@ THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -26,7 +26,7 @@ THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -34,7 +34,7 @@ THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -42,7 +42,7 
@@ THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -50,7 +50,7 @@ THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTe unsigned char>(value)); } -THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -58,7 +58,7 @@ THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -66,7 +66,7 @@ THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -74,7 +74,7 @@ THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -82,7 +82,7 @@ THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, @@ -90,7 +90,7 @@ THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor * scalar_t>(value)); } -THC_API void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) +void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); THC_logicalValue(state, self_, src, diff --git a/aten/src/THC/generic/THCTensorMathCompareT.cu b/aten/src/THC/generic/THCTensorMathCompareT.cu index 6397a0b7caaa96..1bd4b9909fc26d 100644 --- a/aten/src/THC/generic/THCTensorMathCompareT.cu +++ b/aten/src/THC/generic/THCTensorMathCompareT.cu @@ -2,8 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMathCompareT.cu" #else -THC_API void -THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); 
THC_logicalTensor(state, self_, src1, src2, @@ -11,8 +10,7 @@ THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -20,8 +18,7 @@ THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -29,8 +26,7 @@ THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -38,8 +34,7 @@ THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -47,8 +42,7 @@ THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -56,8 +50,7 @@ THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, unsigned char>()); } -THC_API void -THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -65,8 +58,7 @@ THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -74,8 +66,7 @@ THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); 
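Nearly every hunk in these comparison files makes the same mechanical change: THC_API is dropped from the definition while the declaration in the corresponding header keeps it, which is the usual export-macro convention, since the attribute on the declaration is what controls visibility. A toy illustration of that convention, with made-up names:

// Toy example of the declaration-vs-definition split applied throughout this patch.
#ifdef _WIN32
#define MYLIB_API __declspec(dllexport)
#else
#define MYLIB_API __attribute__((visibility("default")))
#endif

MYLIB_API int mylib_add(int a, int b);  // header: exported declaration

int mylib_add(int a, int b) {           // source: no export macro needed here
  return a + b;
}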
THC_logicalTensor(state, self_, src1, src2, @@ -83,8 +74,7 @@ THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -92,8 +82,7 @@ THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, @@ -101,8 +90,7 @@ THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen scalar_t>()); } -THC_API void -THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THC_logicalTensor(state, self_, src1, src2, diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu b/aten/src/THC/generic/THCTensorMathMagma.cu index ecf39d9a1bf0f2..29c7999f74a9b6 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.cu +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -60,7 +60,7 @@ static THCTensor* THCTensor_(newColumnMajor)(THCState *state, THCTensor *self, T } -THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) +void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); @@ -98,8 +98,8 @@ THC_API void THCTensor_(gesv)(THCState *state, THCTensor *rb_, THCTensor *ra_, T #endif } -THC_API void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_, - const char *uplo, const char *trans, const char *diag) +void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_, + const char *uplo, const char *trans, const char *diag) { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); @@ -135,7 +135,7 @@ THC_API void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, #endif } -THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) +void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_) { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 1, "A should be (non-empty) 2 dimensional"); @@ -182,7 +182,7 @@ THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, T #endif } -THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a, const char *jobzs, const char *uplos) +void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a, const char *jobzs, const char *uplos) { #ifdef USE_MAGMA int64_t n = THTensor_sizeLegacyNoScalars(a, 0); @@ -247,7 +247,7 @@ THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, T #endif } -THC_API void 
THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvrs) +void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvrs) { #ifdef USE_MAGMA THArgCheck(a_->dim() == 2, 3, "A should be 2 dimensional"); @@ -321,7 +321,7 @@ THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, T #endif } -THC_API void THCTensor_(gesdd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu) +void THCTensor_(gesdd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu) { #ifdef USE_MAGMA THCTensor *ra_ = THCTensor_(new)(state); @@ -332,7 +332,7 @@ THC_API void THCTensor_(gesdd)(THCState *state, THCTensor *ru_, THCTensor *rs_, #endif } -THC_API void THCTensor_(gesdd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobus) +void THCTensor_(gesdd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobus) { #ifdef USE_MAGMA THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); @@ -396,7 +396,7 @@ THC_API void THCTensor_(gesdd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, #endif } -THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a) +void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a) { THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); THArgCheck(a->size(0) == a->size(1), 2, "A should be square"); @@ -524,7 +524,7 @@ __global__ void THCTensor_(copyLowerSymmetric)(scalar_t *input, int n, int len) } } -THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) +void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) { #ifdef USE_MAGMA THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); @@ -564,7 +564,7 @@ THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, co #endif } -THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) +void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo) { #ifdef USE_MAGMA THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be (non-empty) 2 dimensional"); @@ -600,7 +600,7 @@ THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, co #endif } -THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *a, const char *uplo) +void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, THCTensor *a, const char *uplo) { #ifdef USE_MAGMA THArgCheck(a->size(0) == a->size(1), 2, "A should be square"); @@ -632,7 +632,7 @@ THC_API void THCTensor_(potrs)(THCState *state, THCTensor *rb_, THCTensor *b, TH #endif } -THC_API void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_, THCTensor *a_) +void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_, THCTensor *a_) { #ifdef USE_MAGMA THArgCheck(!a_->is_empty() && a_->dim() == 2, 2, "A should be non-empty 2 dimensional"); @@ -669,7 +669,7 @@ THC_API void THCTensor_(geqrf)(THCState *state, THCTensor *ra_, THCTensor *rtau_ #endif } -THC_API void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a_) +void THCTensor_(qr)(THCState *state, THCTensor *rq_, THCTensor *rr_, THCTensor *a_) { #ifdef USE_MAGMA 
THArgCheck(!a_->is_empty() && a_->dim() == 2, 2, "A should be non-empty 2 dimensional"); diff --git a/aten/src/THC/generic/THCTensorMathPairwise.cu b/aten/src/THC/generic/THCTensorMathPairwise.cu index 40d6bdb6382983..0f2d0067ff5546 100644 --- a/aten/src/THC/generic/THCTensorMathPairwise.cu +++ b/aten/src/THC/generic/THCTensorMathPairwise.cu @@ -2,8 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMathPairwise.cu" #else -THC_API void -THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -21,8 +20,7 @@ THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t val THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -40,30 +38,17 @@ THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t val THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(add_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) +void THCTensor_(add_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) { -#ifdef THC_REAL_IS_HALF - auto v = THC_half2float(value) * THC_half2float(alpha); - THCTensor_(add)(state, self_, src_, THC_float2half(v)); -#else THCTensor_(add)(state, self_, src_, value * alpha); -#endif } -THC_API void -THCTensor_(sub_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) +void THCTensor_(sub_scaled)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value, scalar_t alpha) { -#ifdef THC_REAL_IS_HALF - auto v = THC_half2float(value) * THC_half2float(alpha); - THCTensor_(sub)(state, self_, src_, THC_float2half(v)); -#else THCTensor_(sub)(state, self_, src_, value * alpha); -#endif } -THC_API void -THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -81,8 +66,7 @@ THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t val THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); THArgCheck(value != ScalarConvert::to(0), 3, "divide by zero"); @@ -102,8 +86,7 @@ THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t val THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(lshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(lshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THCTensor_(mul)(state, self_, src_, pow(2, value)); @@ -126,8 +109,7 @@ THCTensor_(lshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t #endif } -THC_API void -THCTensor_(rshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void 
THCTensor_(rshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) THCTensor_(mul)(state, self_, src_, pow(2, -value)); @@ -150,8 +132,7 @@ THCTensor_(rshift)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t #endif } -THC_API void -THCTensor_(fmod)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(fmod)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -169,8 +150,7 @@ THCTensor_(fmod)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t va THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(remainder)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(remainder)(THCState *state, THCTensor *self_, THCTensor *src_, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (self_ == src_) { @@ -245,7 +225,7 @@ void THCTensor_(triu)(THCState *state, THCTensor *self_, THCTensor *src_, int64_ THCudaCheck(cudaGetLastError()); } -THC_API int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_) +int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); if (!THCTensor_(isSameSizeAs(state, self_, src_))) { @@ -269,8 +249,7 @@ THC_API int THCTensor_(equal)(THCState *state, THCTensor *self_, THCTensor *src_ return min != 0; } -THC_API void -THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) return THError("bitand only supported for integer type tensors"); @@ -291,8 +270,7 @@ THCTensor_(bitand)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t #endif } -THC_API void -THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) return THError("bitor only supported for integer type tensors"); @@ -313,8 +291,7 @@ THCTensor_(bitor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t v #endif } -THC_API void -THCTensor_(bitxor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) +void THCTensor_(bitxor)(THCState* state, THCTensor *self_, THCTensor *src_, scalar_t value) { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) return THError("bitxor only supported for integer type tensors"); diff --git a/aten/src/THC/generic/THCTensorMathPointwise.cu b/aten/src/THC/generic/THCTensorMathPointwise.cu index 9192d6c9f9a1d5..4ff836fd53dda6 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.cu +++ b/aten/src/THC/generic/THCTensorMathPointwise.cu @@ -108,8 +108,7 @@ void THCTensor_(clamp)(THCState *state, THCTensor *self_, THCTensor *src, scalar THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y, int dimension) +void THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y, int dimension) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, x, y)); @@ -211,8 +210,7 @@ void THCTensor_(polygamma)(THCState* 
state, THCTensor* self_, int64_t n, THCTens THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, scalar_t w) +void THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, scalar_t w) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, result, a, b)); THArgCheck(THCTensor_(nElement)(state, a) == @@ -228,39 +226,42 @@ THCTensor_(lerp)(THCState *state, THCTensor *result, THCTensor *a, THCTensor *b, #endif -THC_API void -THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) +namespace { +c10::intrusive_ptr retainTensorImpl(THCTensor* self) { + c10::raw::intrusive_ptr::incref(self); + return c10::intrusive_ptr::reclaim(self); +} +} + +void THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) { - auto out = at::Tensor(self_, true); + auto out = at::Tensor(retainTensorImpl(self_)); #ifdef THC_REAL_IS_HALF auto alpha = at::Half(value); #else auto alpha = value; #endif - at::add_out(out, at::Tensor(src1, true), at::Tensor(src2, true), alpha); + at::add_out(out, retainTensorImpl(src1), retainTensorImpl(src2), alpha); } -THC_API void -THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) +void THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, scalar_t value, THCTensor *src2) { - auto out = at::Tensor(self_, true); + auto out = at::Tensor(retainTensorImpl(self_)); #ifdef THC_REAL_IS_HALF auto alpha = at::Half(value); #else auto alpha = value; #endif - at::sub_out(out, at::Tensor(src1, true), at::Tensor(src2, true), alpha); + at::sub_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2)), alpha); } -THC_API void -THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { - auto out = at::Tensor(self_, true); - at::mul_out(out, at::Tensor(src1, true), at::Tensor(src2, true)); + auto out = at::Tensor(retainTensorImpl(self_)); + at::mul_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2))); } -THC_API void -THCTensor_(cpow)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cpow)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -367,15 +368,13 @@ void THCTensor_(tpow)(THCState *state, THCTensor *self_, scalar_t value, THCTens THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { - auto out = at::Tensor(self_, true); - at::div_out(out, at::Tensor(src1, true), at::Tensor(src2, true)); + auto out = at::Tensor(retainTensorImpl(self_)); + at::div_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2))); } -THC_API void -THCTensor_(clshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(clshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) return THError("clshift not supported for torch.CudaHalfTensor"); @@ -402,8 +401,7 @@ THCTensor_(clshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTenso #endif } 
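The retainTensorImpl helper introduced just above is what makes the at::add_out/sub_out/mul_out/div_out rewrites safe: c10::intrusive_ptr reclaim adopts an existing reference (and will decref it on destruction), so wrapping a raw THCTensor* that the caller still owns requires an incref first. A hedged sketch of that contract; only the two c10 calls below appear in the patch, the wrapper name is illustrative:

// Wrap a borrowed raw pointer (T must derive from c10::intrusive_ptr_target).
template <typename T>
c10::intrusive_ptr<T> wrap_borrowed(T* raw) {
  c10::raw::intrusive_ptr::incref(raw);        // +1: the reference the wrapper will own
  return c10::intrusive_ptr<T>::reclaim(raw);  // adopt that +1; the caller's reference is untouched
}
// By contrast, THCStorage_(new)(state) already hands back an owned reference,
// which is why the THCTensor_(new) hunks earlier reclaim it without an extra incref.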
-THC_API void -THCTensor_(crshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(crshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) return THError("crshift not supported for torch.CudaHalfTensor"); @@ -430,8 +428,7 @@ THCTensor_(crshift)(THCState* state, THCTensor *self_, THCTensor *src1, THCTenso #endif } -THC_API void -THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +void THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -449,8 +446,7 @@ THCTensor_(cmax)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *s } } -THC_API void -THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +void THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -468,8 +464,7 @@ THCTensor_(cmin)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *s } } -THC_API void -THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +void THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -487,8 +482,7 @@ THCTensor_(cremainder)(THCState *state, THCTensor *self, THCTensor *src1, THCTen } } -THC_API void -THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) +void THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); THArgCheck(THCTensor_(nElement)(state, src1) == @@ -506,8 +500,7 @@ THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor * } } -THC_API void -THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value) +void THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); @@ -523,8 +516,7 @@ THCTensor_(cmaxValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t } } -THC_API void -THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value) +void THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); @@ -540,8 +532,7 @@ THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor *src, scalar_t } } -THC_API void -THCTensor_(addcmul)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t value, THCTensor *src1, THCTensor *src2) +void THCTensor_(addcmul)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t value, THCTensor *src1, THCTensor *src2) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2)); if(self_ != t) @@ -565,8 +556,7 @@ THCTensor_(addcmul)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t va THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t value, THCTensor *src1, THCTensor *src2) +void THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t value, THCTensor *src1, THCTensor *src2) { 
THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, self_, t, src1, src2)); if(self_ != t) @@ -589,8 +579,7 @@ THCTensor_(addcdiv)(THCState *state, THCTensor *self_, THCTensor *t, scalar_t va THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) return THError("cbitand is only supported for integer type tensors"); @@ -617,8 +606,7 @@ THCTensor_(cbitand)(THCState* state, THCTensor *self_, THCTensor *src1, THCTenso #endif } -THC_API void -THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) return THError("cbitor is only supported for integer type tensors"); @@ -645,8 +633,7 @@ THCTensor_(cbitor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor #endif } -THC_API void -THCTensor_(cbitxor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +void THCTensor_(cbitxor)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) { #if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) return THError("cbitor is only supported for integer type tensors"); diff --git a/aten/src/THC/generic/THCTensorMathReduce.cu b/aten/src/THC/generic/THCTensorMathReduce.cu index 958bf623b85f05..4d7e81750c0ec7 100644 --- a/aten/src/THC/generic/THCTensorMathReduce.cu +++ b/aten/src/THC/generic/THCTensorMathReduce.cu @@ -2,8 +2,7 @@ #define THC_GENERIC_FILE "generic/THCTensorMathReduce.cu" #else -THC_API void -THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { +void THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); if (!THC_reduceDim(state, self, src, thrust::identity{}, @@ -18,8 +17,7 @@ THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, int dimension, THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { +void THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, int dimension, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); if (!THC_reduceDim(state, self, src, thrust::identity{}, @@ -34,8 +32,7 @@ THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, int dimension THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim) +void THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, int dim, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); const accreal size = scalar_cast(THCTensor_(size)(state, src, dim)); @@ -54,8 +51,7 @@ THCTensor_(mean)(THCState *state, THCTensor *self, THCTensor *src, int dim, int #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) -THC_API void -THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t value, int dimension, scalar_t maxnorm) +void THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t value, int dimension, scalar_t 
maxnorm) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); THCTensor *self_; @@ -89,8 +85,7 @@ THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t va THCTensor_(free)(state, data); } -THC_API void -THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) +void THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); @@ -117,8 +112,7 @@ THCTensor_(std)(THCState *state, THCTensor *self_, THCTensor *src, int dimension } } -THC_API void -THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) +void THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, int dimension, int biased, int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src)); @@ -145,15 +139,13 @@ THCTensor_(var)(THCState *state, THCTensor *self_, THCTensor *src, int dimension } } -THC_API accreal -THCTensor_(stdall)(THCState *state, THCTensor *self, int biased) +accreal THCTensor_(stdall)(THCState *state, THCTensor *self, int biased) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); return THCNumerics::sqrt((THCTensor_(varall)(state, self, biased))); } -THC_API accreal -THCTensor_(varall)(THCState *state, THCTensor *self, int biased) +accreal THCTensor_(varall)(THCState *state, THCTensor *self, int biased) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal mean = THCTensor_(meanall)(state, self); @@ -176,8 +168,7 @@ THCTensor_(varall)(THCState *state, THCTensor *self, int biased) return val; } -THC_API void -THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t _value, int dimension, int keepdim) +void THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t _value, int dimension, int keepdim) { const accreal value = scalar_cast(_value); THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, src)); @@ -221,8 +212,7 @@ THCTensor_(norm)(THCState *state, THCTensor* self, THCTensor* src, scalar_t _val THCudaCheck(cudaGetLastError()); } -THC_API accreal -THCTensor_(normall)(THCState *state, THCTensor *self, scalar_t _value) +accreal THCTensor_(normall)(THCState *state, THCTensor *self, scalar_t _value) { const accreal value = scalar_cast(_value); THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); @@ -295,8 +285,7 @@ accreal THCTensor_(dist)(THCState *state, THCTensor *self, #endif -THC_API accreal -THCTensor_(sumall)(THCState *state, THCTensor *self) { +accreal THCTensor_(sumall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal val; if (!THC_reduceAll(state, self, @@ -311,8 +300,7 @@ THCTensor_(sumall)(THCState *state, THCTensor *self) { return val; } -THC_API accreal -THCTensor_(prodall)(THCState *state, THCTensor *self) { +accreal THCTensor_(prodall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal val; if (!THC_reduceAll(state, self, @@ -327,15 +315,13 @@ THCTensor_(prodall)(THCState *state, THCTensor *self) { return val; } -THC_API accreal -THCTensor_(meanall)(THCState *state, THCTensor *self) +accreal THCTensor_(meanall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); return THCTensor_(sumall)(state, self)/THCTensor_(nElement)(state, self); } -THC_API scalar_t -THCTensor_(minall)(THCState *state, THCTensor *self) { +scalar_t 
THCTensor_(minall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal val; if (!THC_reduceAll(state, self, @@ -349,8 +335,7 @@ THCTensor_(minall)(THCState *state, THCTensor *self) { return scalar_cast(val); } -THC_API scalar_t -THCTensor_(maxall)(THCState *state, THCTensor *self) { +scalar_t THCTensor_(maxall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); accreal val; if (!THC_reduceAll(state, self, @@ -364,8 +349,7 @@ THCTensor_(maxall)(THCState *state, THCTensor *self) { return scalar_cast(val); } -THC_API scalar_t -THCTensor_(medianall)(THCState *state, THCTensor *self) { +scalar_t THCTensor_(medianall)(THCState *state, THCTensor *self) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); scalar_t val; @@ -392,13 +376,12 @@ THCTensor_(medianall)(THCState *state, THCTensor *self) { return val; } -THC_API void -THCTensor_(median)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *self, - int dimension, - int keepdim) { +void THCTensor_(median)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *self, + int dimension, + int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self)); int64_t t_size_dim, k; @@ -434,13 +417,12 @@ THCTensor_(median)(THCState *state, THCudaCheck(cudaGetLastError()); } -THC_API void -THCTensor_(max)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *src, - int dimension, - int keepdim) { +void THCTensor_(max)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *src, + int dimension, + int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, values, indices, src)); thrust::pair @@ -453,13 +435,12 @@ THCTensor_(max)(THCState *state, MaxValuePair()); } -THC_API void -THCTensor_(min)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *src, - int dimension, - int keepdim) { +void THCTensor_(min)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *src, + int dimension, + int keepdim) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, values, indices, src)); thrust::pair diff --git a/aten/src/THC/generic/THCTensorMode.cu b/aten/src/THC/generic/THCTensorMode.cu index 274093ef105ae5..db72921bcfd1b0 100644 --- a/aten/src/THC/generic/THCTensorMode.cu +++ b/aten/src/THC/generic/THCTensorMode.cu @@ -2,13 +2,13 @@ #define THC_GENERIC_FILE "generic/THCTensorMode.cu" #else -THC_API void THCTensor_(calculateMode)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *input, - THCudaLongStorage *sortBuffer, - int dimension, - THLongStorage *position) { +void THCTensor_(calculateMode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + THCudaLongStorage *sortBuffer, + int dimension, + THLongStorage *position) { THAssert(THCTensor_(isContiguous)(state, input)); // Because the input is contiguous, we want to get a reference to the @@ -129,14 +129,14 @@ THC_API void THCTensor_(calculateMode)(THCState *state, } // this probably could be a loop, not a recursive algorithm -THC_API void THCTensor_(dimApplyMode)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *input, - THCudaLongStorage *sortBuffer, - int dimension, - THLongStorage *position, - int curDim) { +void THCTensor_(dimApplyMode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + THCudaLongStorage *sortBuffer, + int 
dimension, + THLongStorage *position, + int curDim) { int64_t ndim = THCTensor_(nDimensionLegacyAll)(state, input); // Because we have transposed the Tensor, the data for the dimension we are mode'ing along @@ -155,12 +155,12 @@ THC_API void THCTensor_(dimApplyMode)(THCState *state, #define MAX_GRID_SIZE 65535 #define MAX_BLOCK_SIZE 1024 -THC_API void THCTensor_(mode)(THCState *state, - THCTensor *values, - THCudaLongTensor *indices, - THCTensor *input, - int dimension, - int keepdim) { +void THCTensor_(mode)(THCState *state, + THCTensor *values, + THCudaLongTensor *indices, + THCTensor *input, + int dimension, + int keepdim) { THCTensor *transposed, *contiguous, *valuesTransposed; THLongStorage *position; THCudaLongStorage *sortBuffer; diff --git a/aten/src/THC/generic/THCTensorRandom.cu b/aten/src/THC/generic/THCTensorRandom.cu index 620c73e9af01d3..4cbc6dd1a29999 100644 --- a/aten/src/THC/generic/THCTensorRandom.cu +++ b/aten/src/THC/generic/THCTensorRandom.cu @@ -6,7 +6,7 @@ #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) -THC_API void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, double b) +void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, double b) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -21,7 +21,7 @@ THC_API void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, do THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, double stdv) +void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, double stdv) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -36,13 +36,13 @@ THC_API void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(normal_means)(THCState *state, THCTensor *self, THCTensor *means, double stddev) { +void THCTensor_(normal_means)(THCState *state, THCTensor *self, THCTensor *means, double stddev) { THCTensor_(resizeAs)(state, self, means); THCTensor_(normal)(state, self, 0, stddev); THCTensor_(cadd)(state, self, self, ScalarConvert::to(1), means); } -THC_API void THCTensor_(normal_stddevs)(THCState *state, THCTensor *self, double mean, THCTensor *stddevs) +void THCTensor_(normal_stddevs)(THCState *state, THCTensor *self, double mean, THCTensor *stddevs) { THCTensor_(resizeAs)(state, self, stddevs); THCTensor_(normal)(state, self, 0, 1); @@ -50,7 +50,7 @@ THC_API void THCTensor_(normal_stddevs)(THCState *state, THCTensor *self, double THCTensor_(add)(state, self, self, ScalarConvert::to(mean)); } -THC_API void THCTensor_(normal_means_stddevs)(THCState *state, THCTensor *self, THCTensor *means, THCTensor *stddevs) +void THCTensor_(normal_means_stddevs)(THCState *state, THCTensor *self, THCTensor *means, THCTensor *stddevs) { THCTensor_(resizeAs)(state, self, means); THCTensor_(normal)(state, self, 0, 1); @@ -58,7 +58,7 @@ THC_API void THCTensor_(normal_means_stddevs)(THCState *state, THCTensor *self, THCTensor_(cadd)(state, self, self, ScalarConvert::to(1), means); } -THC_API void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mean, double stdv) +void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mean, double stdv) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); @@ -75,7 +75,7 @@ THC_API void 
THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mea THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(exponential)(THCState* state, THCTensor *self_, double lambda) +void THCTensor_(exponential)(THCState* state, THCTensor *self_, double lambda) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -91,7 +91,7 @@ THC_API void THCTensor_(exponential)(THCState* state, THCTensor *self_, double l THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(cauchy)(THCState* state, THCTensor *self_, double median, double sigma) +void THCTensor_(cauchy)(THCState* state, THCTensor *self_, double median, double sigma) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -128,11 +128,11 @@ void THCTensor_(renormRows)(struct THCState* state, rows, cols); } -THC_API void THCTensor_(multinomial)(struct THCState *state, - THCudaLongTensor *self, - THCTensor *prob_dist, - int n_sample, - int with_replacement) +void THCTensor_(multinomial)(struct THCState *state, + THCudaLongTensor *self, + THCTensor *prob_dist, + int n_sample, + int with_replacement) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self, prob_dist)); THCGenerator* gen = THCRandom_getGenerator(state); @@ -299,7 +299,7 @@ THC_API void THCTensor_(multinomial)(struct THCState *state, } } -THC_API void THCTensor_(multinomialAliasSetup)(THCState *state, THCTensor *_probs, THCudaLongTensor *_J, THCTensor *_q){ +void THCTensor_(multinomialAliasSetup)(THCState *state, THCTensor *_probs, THCudaLongTensor *_J, THCTensor *_q){ THAssert(THCTensor_(isContiguous)(state, _q)); THAssert(THCudaLongTensor_isContiguous(state, _J)); THAssert(THCTensor_(isContiguous)(state, _probs)); @@ -354,7 +354,7 @@ THC_API void THCTensor_(multinomialAliasSetup)(THCState *state, THCTensor *_prob THCudaLongTensor_free(state, larger_short); } -THC_API void THCTensor_(multinomialAliasDraw)(THCState *state, THCudaLongTensor *self, THCudaLongTensor *_J, THCTensor *_q){ +void THCTensor_(multinomialAliasDraw)(THCState *state, THCudaLongTensor *self, THCudaLongTensor *_J, THCTensor *_q){ THAssert(THCTensor_(isContiguous)(state, _q)); THAssert(THCudaLongTensor_isContiguous(state, _J)); THCGenerator* gen = THCRandom_getGenerator(state); @@ -388,7 +388,7 @@ GENERATE_KERNEL1(generate_bernoulli, double, double p, double, curand_uniform_do GENERATE_KERNEL1(generate_bernoulli, scalar_t, double p, float, curand_uniform, (ScalarConvert::to(x <= p))) #endif -THC_API void THCTensor_(bernoulli)(THCState* state, THCTensor *self_, double p) +void THCTensor_(bernoulli)(THCState* state, THCTensor *self_, double p) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -413,7 +413,7 @@ void THCTensor_(bernoulli_Tensor)(THCState *state, THCTensor *self, THCTensor* p } #define DEFINE_BERNOULLI_TENSOR(NAME, PROB_TYPE, PROB_DATA_TYPE) \ -THC_API void THCTensor_(NAME)(THCState* state, \ +void THCTensor_(NAME)(THCState* state, \ THCTensor *self_, PROB_TYPE *probs_) \ { \ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, probs_)); \ @@ -458,7 +458,7 @@ GENERATE_KERNEL2(generate_random, scalar_t, int32_t base, uint32_t range, uint32 static_cast(static_cast(x % range + base))) #endif -THC_API void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p) +void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p) { 
THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); @@ -474,7 +474,7 @@ THC_API void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p) THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_t min_val, int64_t max_val) +void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_t min_val, int64_t max_val) { THArgCheck(min_val < max_val, 2, "max must be greater than min, but got: min = %lld, max = %lld", min_val, max_val); @@ -502,14 +502,14 @@ THC_API void THCTensor_(clampedRandom)(THCState* state, THCTensor *self_, int64_ THCTensor_(freeCopyTo)(state, self, self_); }; -THC_API void THCTensor_(cappedRandom)(THCState* state, THCTensor *self_, int64_t max_val) +void THCTensor_(cappedRandom)(THCState* state, THCTensor *self_, int64_t max_val) { THCTensor_(clampedRandom)(state, self_, 0LL, max_val); }; #define HLF_MANT_DIG 11 -THC_API void THCTensor_(random)(THCState* state, THCTensor *self_) +void THCTensor_(random)(THCState* state, THCTensor *self_) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_)); ptrdiff_t size = THCTensor_(nElement)(state, self_); diff --git a/aten/src/THC/generic/THCTensorSort.cu b/aten/src/THC/generic/THCTensorSort.cu index e310e422600595..03aad5f2d305fd 100644 --- a/aten/src/THC/generic/THCTensorSort.cu +++ b/aten/src/THC/generic/THCTensorSort.cu @@ -5,10 +5,10 @@ // In alignment with default sort on a c++ map, this function // will permute key and value tensors identically, and // in such a way that the 'key' tensor is ordered numerically -THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, - THCTensor* key, - THCudaLongTensor* value, - int dim, bool dir) { +void THCTensor_(sortKeyValueInplace)(THCState* state, + THCTensor* key, + THCudaLongTensor* value, + int dim, bool dir) { THArgCheck(key->sizes().equals(value->sizes()), 2, "Key tensor must have same size as value tensor"); int dims = THCudaLongTensor_nDimensionLegacyNoScalars(state, value); @@ -274,11 +274,11 @@ void THCTensor_(sortViaThrust)(THCState* state, THCudaLongTensor_freeCopyTo(state, trContigIndices, indices); } -THC_API void THCTensor_(sort)(THCState* state, - THCTensor *sorted, - THCudaLongTensor *indices, - THCTensor *input, - int dim, int order) { +void THCTensor_(sort)(THCState* state, + THCTensor *sorted, + THCudaLongTensor *indices, + THCTensor *input, + int dim, int order) { THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, sorted, input)); THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, indices)); int64_t dims = THCTensor_(nDimensionLegacyNoScalars)(state, sorted); diff --git a/aten/src/THC/generic/THCTensorSort.h b/aten/src/THC/generic/THCTensorSort.h index 009d825a223975..eba93fb75773c3 100644 --- a/aten/src/THC/generic/THCTensorSort.h +++ b/aten/src/THC/generic/THCTensorSort.h @@ -4,10 +4,17 @@ /* Performs an in-place sort of (keys, values). 
Only works for slice sizes <= 2048 at the moment (slice size == size of keys/values dim `dim`) */ +#ifdef __cplusplus +THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, + THCTensor* keys, + THCudaLongTensor* values, + int dim, bool dir); +#else THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, THCTensor* keys, THCudaLongTensor* values, int dim, int order); +#endif /* Performs an out-of-place sort of `input`, returning the per-slice indices in `indices` and the sorted values in `sorted` */ diff --git a/aten/src/THC/generic/THCTensorTopK.cu b/aten/src/THC/generic/THCTensorTopK.cu index 71ee008659b12a..a195dfbe5ca7a8 100644 --- a/aten/src/THC/generic/THCTensorTopK.cu +++ b/aten/src/THC/generic/THCTensorTopK.cu @@ -2,11 +2,11 @@ #define THC_GENERIC_FILE "generic/THCTensorTopK.cu" #else -THC_API void THCTensor_(topk)(THCState* state, - THCTensor *topK, - THCudaLongTensor *indices, - THCTensor *input_, - int64_t k, int dim, int dir, int sorted) { +void THCTensor_(topk)(THCState* state, + THCTensor *topK, + THCudaLongTensor *indices, + THCTensor *input_, + int64_t k, int dim, int dir, int sorted) { THAssert(topK != NULL && indices != NULL && input_ != NULL); THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, topK, indices, input_)); THArgCheck(THCTensor_(nDimensionLegacyNoScalars)(state, topK) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); diff --git a/aten/src/THCUNN/Abs.cu b/aten/src/THCUNN/Abs.cu index 72b7ff3c2f53d9..98542eda7e8a75 100644 --- a/aten/src/THCUNN/Abs.cu +++ b/aten/src/THCUNN/Abs.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/AbsCriterion.cu b/aten/src/THCUNN/AbsCriterion.cu index cb0f47510bc559..30aa975594a160 100644 --- a/aten/src/THCUNN/AbsCriterion.cu +++ b/aten/src/THCUNN/AbsCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/BCECriterion.cu b/aten/src/THCUNN/BCECriterion.cu index 3624588015c8a1..e458bb81c9725e 100644 --- a/aten/src/THCUNN/BCECriterion.cu +++ b/aten/src/THCUNN/BCECriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCThrustAllocator.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/BatchNormalization.cu b/aten/src/THCUNN/BatchNormalization.cu index 0c393c3a9d6db0..97579d1c4aefd5 100644 --- a/aten/src/THCUNN/BatchNormalization.cu +++ b/aten/src/THCUNN/BatchNormalization.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/ClassNLLCriterion.cu b/aten/src/THCUNN/ClassNLLCriterion.cu index 1043454ff1528b..dd430e9b88d120 100644 --- a/aten/src/THCUNN/ClassNLLCriterion.cu +++ b/aten/src/THCUNN/ClassNLLCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" diff --git a/aten/src/THCUNN/Col2Im.cu b/aten/src/THCUNN/Col2Im.cu index d7fd995de4b88b..73eca7ff16ad30 100644 --- a/aten/src/THCUNN/Col2Im.cu +++ b/aten/src/THCUNN/Col2Im.cu @@ -4,7 +4,7 @@ #include "THCTensor.hpp" #include "THCStorage.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" 
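// The THCTensorSort.h hunk above leaves two prototypes for the same routine: C++
// translation units see the new `bool dir` parameter that the .cu definition now
// uses, while plain C translation units keep the old `int order` form. A stand-alone
// sketch of that header pattern (sort_inplace is an illustrative name, not the real
// THC declaration):
#ifdef __cplusplus
extern "C" void sort_inplace(float* keys, long* values, int dim, bool descending);  // what C++ callers see
#else
void sort_inplace(float* keys, long* values, int dim, int descending);              // what C callers see
#endif
// Both parameters occupy an int-sized argument slot in the common calling
// conventions, which is presumably what makes the split view safe here.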
#include "generic/Col2Im.cu" diff --git a/aten/src/THCUNN/DistKLDivCriterion.cu b/aten/src/THCUNN/DistKLDivCriterion.cu index e4e85b71045f8e..bd26c0c003bb7e 100644 --- a/aten/src/THCUNN/DistKLDivCriterion.cu +++ b/aten/src/THCUNN/DistKLDivCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/ELU.cu b/aten/src/THCUNN/ELU.cu index 9c4c2ea1fdc8b6..ec441645e8d7d5 100644 --- a/aten/src/THCUNN/ELU.cu +++ b/aten/src/THCUNN/ELU.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/GatedLinearUnit.cu b/aten/src/THCUNN/GatedLinearUnit.cu index aba9f1e794e308..376fdb3855eb23 100644 --- a/aten/src/THCUNN/GatedLinearUnit.cu +++ b/aten/src/THCUNN/GatedLinearUnit.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include #include "common.h" diff --git a/aten/src/THCUNN/HardTanh.cu b/aten/src/THCUNN/HardTanh.cu index 539b22fec5a515..8d6a7953975f75 100644 --- a/aten/src/THCUNN/HardTanh.cu +++ b/aten/src/THCUNN/HardTanh.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/Im2Col.cu b/aten/src/THCUNN/Im2Col.cu index 95bdcd4e8b9cd7..252c488df33f00 100644 --- a/aten/src/THCUNN/Im2Col.cu +++ b/aten/src/THCUNN/Im2Col.cu @@ -2,7 +2,7 @@ #include "common.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/IndexLinear.cu b/aten/src/THCUNN/IndexLinear.cu index eebb7efc153d88..032e8e31d2cffb 100644 --- a/aten/src/THCUNN/IndexLinear.cu +++ b/aten/src/THCUNN/IndexLinear.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/L1Cost.cu b/aten/src/THCUNN/L1Cost.cu index eda58c18e2c6e4..4f11a94f8106c3 100644 --- a/aten/src/THCUNN/L1Cost.cu +++ b/aten/src/THCUNN/L1Cost.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/LeakyReLU.cu b/aten/src/THCUNN/LeakyReLU.cu index ec9efb836c3441..dec13dfd112dcc 100644 --- a/aten/src/THCUNN/LeakyReLU.cu +++ b/aten/src/THCUNN/LeakyReLU.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/LogSigmoid.cu b/aten/src/THCUNN/LogSigmoid.cu index 59ee39a6871bf7..e318fcea4f92f5 100644 --- a/aten/src/THCUNN/LogSigmoid.cu +++ b/aten/src/THCUNN/LogSigmoid.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/LookupTable.cu b/aten/src/THCUNN/LookupTable.cu index 59aa7e8f4612ba..05eb432871ed34 100644 --- a/aten/src/THCUNN/LookupTable.cu +++ b/aten/src/THCUNN/LookupTable.cu @@ -2,7 +2,7 @@ #include "common.h" #include "THCThrustAllocator.cuh" #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensorSort.cuh" #include "../THC/THCTensorMathReduce.cuh" diff --git a/aten/src/THCUNN/LookupTableBag.cu b/aten/src/THCUNN/LookupTableBag.cu 
index 83a60efbb32207..2c9e09cdb7624d 100644 --- a/aten/src/THCUNN/LookupTableBag.cu +++ b/aten/src/THCUNN/LookupTableBag.cu @@ -11,7 +11,7 @@ #include #endif #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensorSort.cuh" diff --git a/aten/src/THCUNN/MSECriterion.cu b/aten/src/THCUNN/MSECriterion.cu index e9571fe06c4e30..99b287baba38a9 100644 --- a/aten/src/THCUNN/MSECriterion.cu +++ b/aten/src/THCUNN/MSECriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCThrustAllocator.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/MarginCriterion.cu b/aten/src/THCUNN/MarginCriterion.cu index 7ccdbb725fe6b1..459d62a4e57a81 100644 --- a/aten/src/THCUNN/MarginCriterion.cu +++ b/aten/src/THCUNN/MarginCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/MultiLabelMarginCriterion.cu b/aten/src/THCUNN/MultiLabelMarginCriterion.cu index 13b432c15c38c6..220de837d2349e 100644 --- a/aten/src/THCUNN/MultiLabelMarginCriterion.cu +++ b/aten/src/THCUNN/MultiLabelMarginCriterion.cu @@ -2,7 +2,7 @@ #include "THCTensor.hpp" #include "common.h" #include "THCReduceApplyUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/MultiMarginCriterion.cu b/aten/src/THCUNN/MultiMarginCriterion.cu index c2fa2134626101..de802c82979985 100644 --- a/aten/src/THCUNN/MultiMarginCriterion.cu +++ b/aten/src/THCUNN/MultiMarginCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/PReLU.cu b/aten/src/THCUNN/PReLU.cu index cdc6b2b71a1ee4..c89152487d2f3f 100644 --- a/aten/src/THCUNN/PReLU.cu +++ b/aten/src/THCUNN/PReLU.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include #include "THCTensor.hpp" @@ -69,7 +69,7 @@ struct PReLUAccGradParametersShared { __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) { - *gradInput = (*input) * (*gradOutput) * (*input <= 0); + *gradInput = (*input) * (*gradOutput) * static_cast(*input <= 0); } }; @@ -84,7 +84,7 @@ struct PReLUAccGradParameters __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) { - *gradInput = (*input) * (*gradOutput) * scale * (*input <= 0); + *gradInput = (*input) * (*gradOutput) * scale * static_cast(*input <= 0); } }; @@ -99,7 +99,7 @@ struct PReLUAccGradParameters1to1 __device__ __forceinline__ void operator()(T *gradWeight, T *input, T *gradOutput) { - *gradWeight += (*input) * (*gradOutput) * scale * (*input <= 0); + *gradWeight += (*input) * (*gradOutput) * scale * static_cast(*input <= 0); } }; diff --git a/aten/src/THCUNN/RReLU.cu b/aten/src/THCUNN/RReLU.cu index 388627791a012f..d044fadadfb0d1 100644 --- a/aten/src/THCUNN/RReLU.cu +++ b/aten/src/THCUNN/RReLU.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include #include "common.h" @@ -15,8 +15,8 @@ template inline T __device__ curand_uniform_type(curandStateMtgp32 *state); template <> -inline half __device__ curand_uniform_type(curandStateMtgp32 *state) { - return 
ScalarConvert::to(curand_uniform(state)); +inline THHalf __device__ curand_uniform_type(curandStateMtgp32 *state) { + return ScalarConvert::to(curand_uniform(state)); } template <> diff --git a/aten/src/THCUNN/Sigmoid.cu b/aten/src/THCUNN/Sigmoid.cu index 0be57d6a621dfc..6ade198fe02af1 100644 --- a/aten/src/THCUNN/Sigmoid.cu +++ b/aten/src/THCUNN/Sigmoid.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/SmoothL1Criterion.cu b/aten/src/THCUNN/SmoothL1Criterion.cu index c8018d997365dc..48a86e3efec996 100644 --- a/aten/src/THCUNN/SmoothL1Criterion.cu +++ b/aten/src/THCUNN/SmoothL1Criterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCThrustAllocator.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/SoftMarginCriterion.cu b/aten/src/THCUNN/SoftMarginCriterion.cu index ee53e76dca2625..63b0ef3cc7ca7e 100644 --- a/aten/src/THCUNN/SoftMarginCriterion.cu +++ b/aten/src/THCUNN/SoftMarginCriterion.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCApply.cuh" diff --git a/aten/src/THCUNN/SoftPlus.cu b/aten/src/THCUNN/SoftPlus.cu index 42b2c3c5ef0617..a8a36c22fa918a 100644 --- a/aten/src/THCUNN/SoftPlus.cu +++ b/aten/src/THCUNN/SoftPlus.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/SoftShrink.cu b/aten/src/THCUNN/SoftShrink.cu index a4e45d87c6b3a3..5ec104155ea737 100644 --- a/aten/src/THCUNN/SoftShrink.cu +++ b/aten/src/THCUNN/SoftShrink.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/SparseLinear.cu b/aten/src/THCUNN/SparseLinear.cu index 2bc17a75a76f7e..08f72046341ca7 100644 --- a/aten/src/THCUNN/SparseLinear.cu +++ b/aten/src/THCUNN/SparseLinear.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu b/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu index 2c671dad5a8364..ff68ab8440f757 100644 --- a/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu +++ b/aten/src/THCUNN/SpatialAdaptiveAveragePooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu b/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu index 592e6fd8b08154..591dd012fdd521 100644 --- a/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu +++ b/aten/src/THCUNN/SpatialAdaptiveMaxPooling.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/SpatialAveragePooling.cu b/aten/src/THCUNN/SpatialAveragePooling.cu index ce9941a62398c7..d07255a954ee77 100644 --- a/aten/src/THCUNN/SpatialAveragePooling.cu +++ b/aten/src/THCUNN/SpatialAveragePooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "common.h" diff --git 
a/aten/src/THCUNN/SpatialClassNLLCriterion.cu b/aten/src/THCUNN/SpatialClassNLLCriterion.cu index 83addd09a2769b..92262354b3b696 100644 --- a/aten/src/THCUNN/SpatialClassNLLCriterion.cu +++ b/aten/src/THCUNN/SpatialClassNLLCriterion.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "common.h" diff --git a/aten/src/THCUNN/SpatialConvolutionLocal.cu b/aten/src/THCUNN/SpatialConvolutionLocal.cu index 17801d52b12688..0af7685432991b 100644 --- a/aten/src/THCUNN/SpatialConvolutionLocal.cu +++ b/aten/src/THCUNN/SpatialConvolutionLocal.cu @@ -2,7 +2,7 @@ #include "common.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/SpatialConvolutionMM.cu b/aten/src/THCUNN/SpatialConvolutionMM.cu index 4a59acb2975188..d9f6f128efe81b 100644 --- a/aten/src/THCUNN/SpatialConvolutionMM.cu +++ b/aten/src/THCUNN/SpatialConvolutionMM.cu @@ -3,7 +3,7 @@ #include "common.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/SpatialConvolutionMM.cu" diff --git a/aten/src/THCUNN/SpatialCrossMapLRN.cu b/aten/src/THCUNN/SpatialCrossMapLRN.cu index cd6f081b1302df..2262ddba4743ed 100644 --- a/aten/src/THCUNN/SpatialCrossMapLRN.cu +++ b/aten/src/THCUNN/SpatialCrossMapLRN.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/SpatialDilatedConvolution.cu b/aten/src/THCUNN/SpatialDilatedConvolution.cu index b8e96024fdaa87..b0edadb2b357fc 100644 --- a/aten/src/THCUNN/SpatialDilatedConvolution.cu +++ b/aten/src/THCUNN/SpatialDilatedConvolution.cu @@ -2,7 +2,7 @@ #include "common.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/SpatialDilatedMaxPooling.cu b/aten/src/THCUNN/SpatialDilatedMaxPooling.cu index 6732e4f2b53409..3aef4ecf524cce 100644 --- a/aten/src/THCUNN/SpatialDilatedMaxPooling.cu +++ b/aten/src/THCUNN/SpatialDilatedMaxPooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCNumerics.cuh" #include "common.h" diff --git a/aten/src/THCUNN/SpatialFractionalMaxPooling.cu b/aten/src/THCUNN/SpatialFractionalMaxPooling.cu index f3ca162453107a..71ddce8c84a995 100644 --- a/aten/src/THCUNN/SpatialFractionalMaxPooling.cu +++ b/aten/src/THCUNN/SpatialFractionalMaxPooling.cu @@ -3,7 +3,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialFullConvolution.cu b/aten/src/THCUNN/SpatialFullConvolution.cu index 4e37ecf280bbf5..b4eff4b9c22d1a 100644 --- a/aten/src/THCUNN/SpatialFullConvolution.cu +++ b/aten/src/THCUNN/SpatialFullConvolution.cu @@ -1,7 +1,7 @@ #include "THCUNN.h" #include "im2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/SpatialFullConvolution.cu" diff --git a/aten/src/THCUNN/SpatialFullDilatedConvolution.cu b/aten/src/THCUNN/SpatialFullDilatedConvolution.cu index 
61e1fe5910ad18..9ba2236cea66cd 100644 --- a/aten/src/THCUNN/SpatialFullDilatedConvolution.cu +++ b/aten/src/THCUNN/SpatialFullDilatedConvolution.cu @@ -2,7 +2,7 @@ #include "im2col.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/SpatialFullDilatedConvolution.cu" diff --git a/aten/src/THCUNN/SpatialReflectionPadding.cu b/aten/src/THCUNN/SpatialReflectionPadding.cu index 96472eed08b839..0b0643c8e41342 100644 --- a/aten/src/THCUNN/SpatialReflectionPadding.cu +++ b/aten/src/THCUNN/SpatialReflectionPadding.cu @@ -7,7 +7,7 @@ #include "THCReduceApplyUtils.cuh" #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialReplicationPadding.cu b/aten/src/THCUNN/SpatialReplicationPadding.cu index f63c2090d5fb14..1ee5c62f6e7132 100644 --- a/aten/src/THCUNN/SpatialReplicationPadding.cu +++ b/aten/src/THCUNN/SpatialReplicationPadding.cu @@ -7,7 +7,7 @@ #include "THCReduceApplyUtils.cuh" #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialSubSampling.cu b/aten/src/THCUNN/SpatialSubSampling.cu index bb0484662254fd..8e8e390c136388 100644 --- a/aten/src/THCUNN/SpatialSubSampling.cu +++ b/aten/src/THCUNN/SpatialSubSampling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialUpSamplingBilinear.cu b/aten/src/THCUNN/SpatialUpSamplingBilinear.cu index 07daa0e9fec01d..b093ee287edce0 100644 --- a/aten/src/THCUNN/SpatialUpSamplingBilinear.cu +++ b/aten/src/THCUNN/SpatialUpSamplingBilinear.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/SpatialUpSamplingNearest.cu b/aten/src/THCUNN/SpatialUpSamplingNearest.cu index 889d64e1817e1c..8b0784cfd75351 100644 --- a/aten/src/THCUNN/SpatialUpSamplingNearest.cu +++ b/aten/src/THCUNN/SpatialUpSamplingNearest.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/Sqrt.cu b/aten/src/THCUNN/Sqrt.cu index a52ce34117aaf0..7358f8c6bd0bc7 100644 --- a/aten/src/THCUNN/Sqrt.cu +++ b/aten/src/THCUNN/Sqrt.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/Square.cu b/aten/src/THCUNN/Square.cu index 66bbec49d29cd9..f44fbfe7ea4fc4 100644 --- a/aten/src/THCUNN/Square.cu +++ b/aten/src/THCUNN/Square.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/THCHalfAutoNumerics.cuh b/aten/src/THCUNN/THCHalfAutoNumerics.cuh index 5f8fda89909552..fe8a8bbc3cdc1a 100644 --- a/aten/src/THCUNN/THCHalfAutoNumerics.cuh +++ b/aten/src/THCUNN/THCHalfAutoNumerics.cuh @@ -1,23 +1,23 @@ #ifndef THC_HALF_AUTO_NUMERICS_INC #define THC_HALF_AUTO_NUMERICS_INC -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCNumerics.cuh" -// WARNING: THCNumerics is being deprecated. 
Read the comments and function usage +// WARNING: THCNumerics is being deprecated. Read the comments and function usage // in THCNumerics to learn about the deprecation -// +// // Half numerics functions defined as free functions, so cunn code can be -//written generically, i.e. without excessive calling of THCNumerics functions. +// written generically, i.e. without excessive calling of THCNumerics functions. // these functions should move to THCNumerics -inline __host__ __device__ half fmaxType(half x, half y) { - return THCNumerics::ge(x, y) ? x : y; +inline __host__ __device__ THHalf fmaxType(THHalf x, THHalf y) { + return THCNumerics::ge(x, y) ? x : y; } -inline __host__ __device__ float fmaxType(float x, half y) { - return fmaxf(x, ScalarConvert::to(y)); +inline __host__ __device__ float fmaxType(float x, THHalf y) { + return fmaxf(x, ScalarConvert::to(y)); } inline __host__ __device__ float fmaxType(float x, float y) { @@ -31,217 +31,40 @@ inline __host__ __device__ double fmaxType(double x, double y) { // arithmetic functions -inline __host__ __device__ half operator+(half a, half b) { - return THCNumerics::add(a, b); +inline __host__ __device__ THHalf abs(THHalf a) { + return THCNumerics::abs(a); } -inline __host__ __device__ float operator+(half a, float b) { - return ScalarConvert::to(a) + b; +inline __host__ __device__ THHalf exp(THHalf a) { + return THCNumerics::exp(a); } -inline __host__ __device__ float operator+(float a, half b) { - return a + ScalarConvert::to(b); +inline __host__ __device__ THHalf log10(THHalf a) { + return THCNumerics::log10(a); } -inline __host__ __device__ double operator+(double a, half b) { - return a + ScalarConvert::to(b); +inline __host__ __device__ THHalf log1p(THHalf a) { + return THCNumerics::log1p(a); } -inline __host__ __device__ half operator-(half a) { - return THCNumerics::neg(a); +inline __host__ __device__ THHalf log2(THHalf a) { + return THCNumerics::log2(a); } -inline __host__ __device__ half operator-(half a, half b) { - return THCNumerics::add(a, THCNumerics::neg(b)); +inline __host__ __device__ THHalf expm1(THHalf a) { + return THCNumerics::expm1(a); } -inline __host__ __device__ half operator-(half a, int b) { - return THCNumerics::add(a, THCNumerics::neg(ScalarConvert::to(b))); +inline __host__ __device__ THHalf pow(THHalf a, THHalf b) { + return THCNumerics::pow(a, b); } -inline __host__ __device__ float operator-(half a, float b) { - return ScalarConvert::to(a) - b; +inline __host__ __device__ THHalf sqrt(THHalf a) { + return THCNumerics::sqrt(a); } -inline __host__ __device__ double operator-(half a, double b) { - return ScalarConvert::to(a) - b; -} - -inline __host__ __device__ half operator-(int a, half b) { - return THCNumerics::add(ScalarConvert::to(a), THCNumerics::neg(b)); -} - -inline __host__ __device__ float operator-(float a, half b) { - return a - ScalarConvert::to(b); -} - -inline __host__ __device__ double operator-(double a, half b) { - return a - ScalarConvert::to(b); -} - -inline __host__ __device__ half operator*(half a, half b) { - return THCNumerics::mul(a, b); -} - -inline __host__ __device__ float operator*(half a, float b) { - return ScalarConvert::to(a) * b; -} - -inline __host__ __device__ double operator*(half a, double b) { - return ScalarConvert::to(a) * b; -} - -inline __host__ __device__ half operator*(half a, int b) { - return a * ScalarConvert::to(b); -} - -inline __host__ __device__ float operator*(float a, half b) { - return a * ScalarConvert::to(b); -} - -inline __host__ __device__ double 
operator*(double a, half b) { - return a * ScalarConvert::to(b); -} - -inline __host__ __device__ half operator/(half a, half b) { - return THCNumerics::div(a, b); -} - -inline __host__ __device__ float operator/(float a, half b) { - return a / ScalarConvert::to(b); -} - -inline __host__ __device__ double operator/(double a, half b) { - return a / ScalarConvert::to(b); -} - -inline __host__ __device__ half operator/(int a, half b) { - return ScalarConvert::to(a) / b; -} - -inline __host__ __device__ float operator/(half a, float b) { - return ScalarConvert::to(a) / b; -} - -inline __host__ __device__ double operator/(half a, double b) { - return ScalarConvert::to(a) / b; -} - -inline __host__ __device__ half operator/(half a, int b) { - return a / ScalarConvert::to(b); -} - -inline __host__ __device__ half& operator+=(half &lhs, const half &rhs) { - lhs = lhs + rhs; - return lhs; -} -inline __host__ __device__ float& operator+=(float &lhs, const half &rhs) { - lhs = lhs + rhs; - return lhs; -} - -inline __host__ __device__ float& operator-=(float &lhs, const half &rhs) { - lhs = lhs - rhs; - return lhs; -} - -inline __host__ __device__ half& operator*=(half &lhs, const half &rhs) { - lhs = lhs * rhs; - return lhs; -} - -inline __host__ __device__ half& operator/=(half &lhs, const int &rhs) { - lhs = lhs / rhs; - return lhs; -} - -inline __host__ __device__ half& operator/=(half &lhs, const half &rhs) { - lhs = lhs / rhs; - return lhs; -} - -inline __host__ __device__ half abs(half a) { - return THCNumerics::abs(a); -} - -inline __host__ __device__ half exp(half a) { - return THCNumerics::exp(a); -} - -inline __host__ __device__ half log10(half a) { - return THCNumerics::log10(a); -} - -inline __host__ __device__ half log1p(half a) { - return THCNumerics::log1p(a); -} - -inline __host__ __device__ half log2(half a) { - return THCNumerics::log2(a); -} - -inline __host__ __device__ half expm1(half a) { - return THCNumerics::expm1(a); -} - -inline __host__ __device__ half pow(half a, half b) { - return THCNumerics::pow(a, b); -} - -inline __host__ __device__ half sqrt(half a) { - return THCNumerics::sqrt(a); -} - -inline __host__ __device__ half tanh(half a) { - return THCNumerics::tanh(a); -} - -#if defined(_MSC_VER) && CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__) -inline __host__ __device__ half operator+(half a, int b) { - return THCNumerics::add(a, ScalarConvert::to(b)); -} - -inline __host__ __device__ double operator+(half a, double b) { - return ScalarConvert::to(a) + b; -} - -inline __host__ __device__ half operator*(half a, bool b) { - return THCNumerics::mul(a, ScalarConvert::to(b)); -} -#endif - -// comparison functions - -inline __host__ __device__ bool operator<(half a, half b) { - return THCNumerics::lt(a, b); -} - -inline __host__ __device__ bool operator<=(half a, half b) { - return THCNumerics::le(a, b); -} - -inline __host__ __device__ bool operator<=(half a, int b) { - return THCNumerics::le(a, ScalarConvert::to(b)); -} - -inline __host__ __device__ bool operator<(half a, int b) { - return THCNumerics::lt(a, ScalarConvert::to(b)); -} - -inline __host__ __device__ bool operator>(half a, half b) { - return THCNumerics::gt(a, b); -} - -inline __host__ __device__ bool operator>(half a, int b) { - return THCNumerics::gt(a, ScalarConvert::to(b)); -} - -inline __host__ __device__ bool operator>=(half a, half b) { - return THCNumerics::ge(a, b); -} - -inline __host__ __device__ bool operator>=(half a, int b) { - return THCNumerics::ge(a, ScalarConvert::to(b)); +inline 
__host__ __device__ THHalf tanh(THHalf a) { + return THCNumerics::tanh(a); } #endif diff --git a/aten/src/THCUNN/Tanh.cu b/aten/src/THCUNN/Tanh.cu index a19cc71c045dd2..676da711f33443 100644 --- a/aten/src/THCUNN/Tanh.cu +++ b/aten/src/THCUNN/Tanh.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/TemporalConvolution.cu b/aten/src/THCUNN/TemporalConvolution.cu index af12169d7a4ce8..847b82449ac60f 100644 --- a/aten/src/THCUNN/TemporalConvolution.cu +++ b/aten/src/THCUNN/TemporalConvolution.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/TemporalMaxPooling.cu b/aten/src/THCUNN/TemporalMaxPooling.cu index 2508f835177b25..de478339d8f984 100644 --- a/aten/src/THCUNN/TemporalMaxPooling.cu +++ b/aten/src/THCUNN/TemporalMaxPooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/TemporalReflectionPadding.cu b/aten/src/THCUNN/TemporalReflectionPadding.cu index 4dd4da84c0a2d6..9e905f914653ca 100644 --- a/aten/src/THCUNN/TemporalReflectionPadding.cu +++ b/aten/src/THCUNN/TemporalReflectionPadding.cu @@ -8,7 +8,7 @@ #include "THCTensor.hpp" #include "THCStorage.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/TemporalReplicationPadding.cu b/aten/src/THCUNN/TemporalReplicationPadding.cu index 2c812bda8d64f5..3f74d1e62473b2 100644 --- a/aten/src/THCUNN/TemporalReplicationPadding.cu +++ b/aten/src/THCUNN/TemporalReplicationPadding.cu @@ -7,7 +7,7 @@ #include "THCReduceApplyUtils.cuh" #include -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/TemporalRowConvolution.cu b/aten/src/THCUNN/TemporalRowConvolution.cu index 745fef807510d8..097c78fde6f81b 100644 --- a/aten/src/THCUNN/TemporalRowConvolution.cu +++ b/aten/src/THCUNN/TemporalRowConvolution.cu @@ -2,7 +2,7 @@ #include "common.h" #include "row2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCTensor.hpp" #include "THCStorage.hpp" diff --git a/aten/src/THCUNN/TemporalUpSamplingLinear.cu b/aten/src/THCUNN/TemporalUpSamplingLinear.cu index 89b0c37b1fa78d..2aaf4bcf4435e4 100644 --- a/aten/src/THCUNN/TemporalUpSamplingLinear.cu +++ b/aten/src/THCUNN/TemporalUpSamplingLinear.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/TemporalUpSamplingNearest.cu b/aten/src/THCUNN/TemporalUpSamplingNearest.cu index c87129da7e1563..225e319423e8ad 100644 --- a/aten/src/THCUNN/TemporalUpSamplingNearest.cu +++ b/aten/src/THCUNN/TemporalUpSamplingNearest.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/Threshold.cu b/aten/src/THCUNN/Threshold.cu index 37290894103919..1d44e442e21277 100644 --- a/aten/src/THCUNN/Threshold.cu +++ b/aten/src/THCUNN/Threshold.cu @@ -1,5 
+1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu b/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu index 84e2c7f7063c3a..89ecfe09bfaf94 100644 --- a/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu +++ b/aten/src/THCUNN/VolumetricAdaptiveAveragePooling.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "THCTensor.hpp" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu b/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu index 6d542ba39037ee..3e631b2755e9bd 100644 --- a/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu +++ b/aten/src/THCUNN/VolumetricAdaptiveMaxPooling.cu @@ -1,5 +1,5 @@ #include "THCUNN.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include "THCTensor.hpp" diff --git a/aten/src/THCUNN/VolumetricAveragePooling.cu b/aten/src/THCUNN/VolumetricAveragePooling.cu index 110eac44dcb997..66e89d2a950b72 100644 --- a/aten/src/THCUNN/VolumetricAveragePooling.cu +++ b/aten/src/THCUNN/VolumetricAveragePooling.cu @@ -4,7 +4,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricConvolution.cu b/aten/src/THCUNN/VolumetricConvolution.cu index da66140fb61537..2e405f3c3b00c2 100644 --- a/aten/src/THCUNN/VolumetricConvolution.cu +++ b/aten/src/THCUNN/VolumetricConvolution.cu @@ -1,7 +1,7 @@ #include "THCUNN.h" #include "THCTensor.hpp" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" // Kernel for fast unfold+copy diff --git a/aten/src/THCUNN/VolumetricDilatedConvolution.cu b/aten/src/THCUNN/VolumetricDilatedConvolution.cu index 8a32c70b6701ad..09fc83b01a54f2 100644 --- a/aten/src/THCUNN/VolumetricDilatedConvolution.cu +++ b/aten/src/THCUNN/VolumetricDilatedConvolution.cu @@ -2,7 +2,7 @@ #include "THCTensor.hpp" #include "common.h" #include "vol2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/VolumetricDilatedConvolution.cu" diff --git a/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu b/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu index 1a0f2f617d343e..bff981f73d3a3a 100644 --- a/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu +++ b/aten/src/THCUNN/VolumetricDilatedMaxPooling.cu @@ -4,7 +4,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu b/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu index e6260ceabbe282..4875ae9f7da07a 100644 --- a/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu +++ b/aten/src/THCUNN/VolumetricFractionalMaxPooling.cu @@ -3,7 +3,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricFullConvolution.cu b/aten/src/THCUNN/VolumetricFullConvolution.cu index 556b5bc1d4a5ac..6c4ace126e04e7 100644 --- a/aten/src/THCUNN/VolumetricFullConvolution.cu +++ 
b/aten/src/THCUNN/VolumetricFullConvolution.cu @@ -1,6 +1,6 @@ #include "THCUNN.h" #include "common.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/VolumetricFullConvolution.cu" diff --git a/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu b/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu index c5c7196bac899e..d315ace7582440 100644 --- a/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu +++ b/aten/src/THCUNN/VolumetricFullDilatedConvolution.cu @@ -2,7 +2,7 @@ #include "THCTensor.hpp" #include "common.h" #include "vol2col.h" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "generic/VolumetricFullDilatedConvolution.cu" diff --git a/aten/src/THCUNN/VolumetricMaxUnpooling.cu b/aten/src/THCUNN/VolumetricMaxUnpooling.cu index eac3b2d17af5e4..0974ebc763cd8d 100644 --- a/aten/src/THCUNN/VolumetricMaxUnpooling.cu +++ b/aten/src/THCUNN/VolumetricMaxUnpooling.cu @@ -4,7 +4,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include diff --git a/aten/src/THCUNN/VolumetricReplicationPadding.cu b/aten/src/THCUNN/VolumetricReplicationPadding.cu index 27ea3ecad3faa2..e9ff31de27240b 100644 --- a/aten/src/THCUNN/VolumetricReplicationPadding.cu +++ b/aten/src/THCUNN/VolumetricReplicationPadding.cu @@ -5,7 +5,7 @@ #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" #include "THCReduceApplyUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" #include diff --git a/aten/src/THCUNN/VolumetricUpSamplingNearest.cu b/aten/src/THCUNN/VolumetricUpSamplingNearest.cu index babbd58b0d4a0f..2f06bdaa78ca0b 100644 --- a/aten/src/THCUNN/VolumetricUpSamplingNearest.cu +++ b/aten/src/THCUNN/VolumetricUpSamplingNearest.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu b/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu index 0f353b91acb7ea..ea4c50433370fe 100644 --- a/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu +++ b/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu @@ -7,7 +7,7 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" -#include "THCHalf.h" +#include "TH/THHalf.h" #include "THCHalfAutoNumerics.cuh" #include "THCAtomics.cuh" diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 67c7856354a31d..6f679dde9e180b 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -32,9 +32,8 @@ if (NOT BUILD_ATEN_MOBILE) # Add source, includes, and libs to lists list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS}) list(APPEND Caffe2_GPU_SRCS ${ATen_CUDA_SRCS}) - # ATen tests use catch instead of gtest so keep separate for now - # list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS}) - # list(APPEND Caffe2_GPU_TEST_SRCS ${ATen_CUDA_TEST_SRCS}) + list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS}) + list(APPEND Caffe2_GPU_TEST_SRCS ${ATen_CUDA_TEST_SRCS}) list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CORE_TEST_SRCS}) list(APPEND Caffe2_CPU_INCLUDE ${ATen_CPU_INCLUDE}) list(APPEND Caffe2_GPU_INCLUDE ${ATen_CUDA_INCLUDE}) @@ -86,8 +85,10 @@ add_subdirectory(mobile) add_subdirectory(mpi) add_subdirectory(observers) add_subdirectory(onnx) -add_subdirectory(operators) 
-add_subdirectory(operators/rnn) +if (BUILD_CAFFE2_OPS) + add_subdirectory(operators) + add_subdirectory(operators/rnn) +endif() add_subdirectory(opt) add_subdirectory(perfkernels) add_subdirectory(python) @@ -290,6 +291,11 @@ if (MSVC AND NOT BUILD_SHARED_LIBS) # as the latter is not respected by nvcc target_compile_definitions(caffe2 PUBLIC "AT_CORE_STATIC_WINDOWS=1") endif() +if (MSVC AND BUILD_SHARED_LIBS) + # ONNX is linked statically and needs to be exported from this library + # to be used externally. Make sure that references match the export. + target_compile_options(caffe2 PRIVATE "-DONNX_BUILD_MAIN_LIB") +endif() # Use -O2 for release builds (-O3 doesn't improve perf, and -Os results in perf regression) target_compile_options(caffe2 PRIVATE "$<$,$>:-O2>") install(TARGETS caffe2 EXPORT Caffe2Targets DESTINATION lib) @@ -426,30 +432,11 @@ if (BUILD_TEST) endforeach() endif() - if (NOT USE_ROCM) - set(__aten_test_dir "test/aten") - foreach(test_src ${ATen_CPU_TEST_SRCS}) - get_filename_component(test_name ${test_src} NAME_WE) - add_executable(${test_name} "${test_src}") - target_include_directories(${test_name} PRIVATE $) - target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) - target_include_directories(${test_name} SYSTEM PRIVATE ${Caffe2_DEPENDENCY_INCLUDE}) - target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS}) - add_test(NAME ${test_name} COMMAND $) - install(TARGETS ${test_name} DESTINATION ${__aten_test_dir}) - endforeach() - - if(USE_CUDA OR USE_ROCM) - foreach(test_src ${ATen_CUDA_TEST_SRCS}) - get_filename_component(test_name ${test_src} NAME_WE) - torch_cuda_based_add_executable(${test_name} "${test_src}") - target_include_directories(${test_name} PRIVATE $) - target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) - target_include_directories(${test_name} SYSTEM PRIVATE ${Caffe2_DEPENDENCY_INCLUDE}) - target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS}) - add_test(NAME ${test_name} COMMAND $) - install(TARGETS ${test_name} DESTINATION ${__aten_test_dir}) - endforeach() + # For special tests that explicitly uses dependencies, we add them here + if (USE_MPI) + target_link_libraries(mpi_test ${MPI_CXX_LIBRARIES}) + if (USE_CUDA) + target_link_libraries(mpi_gpu_test ${MPI_CXX_LIBRARIES}) endif() endif() endif() @@ -510,6 +497,9 @@ if (BUILD_PYTHON) if (APPLE) set_target_properties(caffe2_pybind11_state PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") endif() + if (WIN32) + set_target_properties(caffe2_pybind11_state PROPERTIES LINK_FLAGS "/FORCE:UNRESOLVED") + endif() target_include_directories(caffe2_pybind11_state PRIVATE $) target_include_directories(caffe2_pybind11_state PRIVATE ${Caffe2_CPU_INCLUDE}) @@ -535,6 +525,9 @@ if (BUILD_PYTHON) if (APPLE) set_target_properties(caffe2_pybind11_state_gpu PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") endif() + if (WIN32) + set_target_properties(caffe2_pybind11_state_gpu PROPERTIES LINK_FLAGS "/FORCE:UNRESOLVED") + endif() target_include_directories(caffe2_pybind11_state_gpu PRIVATE $) target_include_directories(caffe2_pybind11_state_gpu PRIVATE ${Caffe2_CPU_INCLUDE}) target_link_libraries( @@ -560,6 +553,9 @@ if (BUILD_PYTHON) if (APPLE) set_target_properties(caffe2_pybind11_state_hip PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") endif() + if (WIN32) + set_target_properties(caffe2_pybind11_state_hip PROPERTIES LINK_FLAGS "/FORCE:UNRESOLVED") + endif() target_include_directories(caffe2_pybind11_state_hip PRIVATE $) target_include_directories(caffe2_pybind11_state_hip PRIVATE 
${Caffe2_CPU_INCLUDE}) target_link_libraries( diff --git a/caffe2/contrib/aten/CMakeLists.txt b/caffe2/contrib/aten/CMakeLists.txt index 92eb671e019cb7..add3918d4c3373 100644 --- a/caffe2/contrib/aten/CMakeLists.txt +++ b/caffe2/contrib/aten/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT BUILD_ATEN_MOBILE) +if(NOT BUILD_ATEN_MOBILE AND BUILD_CAFFE2_OPS) # Add source generated by Codegen.cmake and pass to parent list(APPEND Caffe2_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op.cc) list(APPEND Caffe2_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op_cuda.cc) diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h index 9f327fdd5d82d4..583a2fa492647b 100644 --- a/caffe2/contrib/aten/aten_op_template.h +++ b/caffe2/contrib/aten/aten_op_template.h @@ -91,14 +91,22 @@ class ATenOp : public Operator { void assignTo(Tensor* dst, const at::Tensor& src_) { at::Tensor src = src_.contiguous(); auto at_sizes = src.sizes(); - std::vector dims(at_sizes.begin(),at_sizes.end()); + caffe2::TypeMeta type_meta = typeMetaFor(src); + at::Device device = src.device(); + at::TensorImpl* src_impl = src.unsafeReleaseTensorImpl(); + std::vector dims(at_sizes.begin(), at_sizes.end()); dst->Resize(dims); dst->ShareExternalPointer( - src.data_ptr(), typeMetaFor(src), 0, [src](void* ptr) mutable { - // return a closure that holds a handle to t until it is called - // to keep the aten memory alive - return src.reset(); - }); + at::DataPtr( + src_impl->data(), + static_cast(src_impl), + [](void* t_ptr) -> void { + at::TensorImpl* local_impl = static_cast(t_ptr); + c10::raw::intrusive_ptr::decref(local_impl); + }, + device), + type_meta, + 0); } void assignListStartingAt( size_t offset, @@ -214,10 +222,10 @@ class ATenOp : public Operator { DEFINE_IF(int64, Long) CAFFE_THROW("unsupported type annotation: ", name); } - at::Type & stringToType(const std::string & name) { + at::TypeExtendedInterface & stringToType(const std::string & name) { return at::getNonVariableType(backend(), stringToScalarType(name)); } - at::Type * readTypeAttribute(const std::string & name) { + at::TypeExtendedInterface * readTypeAttribute(const std::string & name) { CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType(name)); return &stringToType(OperatorBase::GetSingleArgument(name, "")); } diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py index 18a3db4c7daed3..70843bb0d91108 100755 --- a/caffe2/contrib/aten/gen_op.py +++ b/caffe2/contrib/aten/gen_op.py @@ -278,7 +278,7 @@ def find_factory_methods(decls): # first tensor input is used to define the output type. 
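// The aten_op_template.h hunk above replaces the lambda that captured `src` to keep
// the ATen tensor alive with an explicit context+deleter pair: the TensorImpl is
// released, stored as the DataPtr's context, and decref'd when the Caffe2 tensor
// drops the buffer. A generic, self-contained sketch of that ownership-transfer
// pattern (RefCounted, SharedBuffer, decref are illustrative names, not the
// ATen/Caffe2 API):
#include <atomic>
#include <cstdio>

struct RefCounted {
  std::atomic<int> refcount{1};
  float data[4] = {1.f, 2.f, 3.f, 4.f};
};

inline void decref(void* ctx) {                         // deleter: runs when the consumer lets go
  auto* rc = static_cast<RefCounted*>(ctx);
  if (rc->refcount.fetch_sub(1) == 1) delete rc;
}

struct SharedBuffer {                                    // consumer side: raw pointer + context + deleter
  void* data;
  void* ctx;
  void (*deleter)(void*);
  ~SharedBuffer() { if (deleter) deleter(ctx); }
};

int main() {
  auto* producer = new RefCounted();                     // refcount == 1, producer owns the storage
  SharedBuffer buf{producer->data, producer, &decref};   // hand the pointer off; context keeps it alive
  std::printf("%f\n", static_cast<float*>(buf.data)[2]); // consumer reads through the borrowed pointer
  return 0;                                              // ~SharedBuffer decrefs and frees the storage
}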
defined_inferred_type = True env['statements'].append( - 'auto inferred_type = &({}.type());'.format( + 'auto inferred_type = &at::getType({});'.format( arg['name'])) else: init = CT(ARGUMENT_MAP[arg['type']]).substitute(env, arg=arg['name']) diff --git a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc index 716ff7a70814e0..3612d8b46f1f8d 100644 --- a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc +++ b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc @@ -95,10 +95,10 @@ void BlobToTensorProto( } // Set values - if (blob->template IsType(CPU)) { + if (blob->IsTensorType(CPU)) { const auto& cpu_tensor = blob->template Get(); CPUTensorToTensorProto(cpu_tensor, t); - } else if (blob->template IsType(CUDA)) { + } else if (blob->IsTensorType(CUDA)) { const auto& cuda_tensor = blob->template Get(); const auto cpu_tensor = TensorCPU(cuda_tensor, context); context->FinishDeviceComputation(); diff --git a/caffe2/core/allocator.h b/caffe2/core/allocator.h index 1e9be9f50b39d8..96bc720ccd59d1 100644 --- a/caffe2/core/allocator.h +++ b/caffe2/core/allocator.h @@ -11,8 +11,8 @@ CAFFE2_DECLARE_bool(caffe2_cpu_allocator_do_zero_fill); namespace caffe2 { -// Use 32-byte alignment should be enough for computation up to AVX512. -constexpr size_t gCaffe2Alignment = 32; +// Use 64-byte alignment should be enough for computation up to AVX512. +constexpr size_t gCaffe2Alignment = 64; using MemoryDeleter = void (*)(void*); diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h index f085ee23995bd7..490b58c1565e6e 100644 --- a/caffe2/core/blob.h +++ b/caffe2/core/blob.h @@ -23,35 +23,22 @@ namespace caffe2 { * properly when the blob is deallocated or re-allocated with a new type. A blob * could contain anything, although the most common case is to contain a Tensor. */ -class CAFFE2_API Blob { +class CAFFE2_API Blob final { public: - typedef void (*DestroyCall)(void*); + using DestroyCall = void(void*); /** * Initializes an empty Blob. */ - Blob() : meta_(), pointer_(nullptr) {} + Blob() noexcept : meta_(), pointer_(nullptr), destroy_(nullptr) {} ~Blob() { Reset(); } - Blob(Blob&& other) noexcept - : meta_(std::move(other.meta_)), - pointer_(std::move(other.pointer_)), - destroy_(std::move(other.destroy_)) { - other.meta_ = {}; - other.pointer_ = nullptr; - other.destroy_ = nullptr; + Blob(Blob&& other) noexcept : Blob() { + swap(other); } Blob& operator=(Blob&& other) noexcept { - if (pointer_ && destroy_) { - destroy_(pointer_); - } - meta_ = std::move(other.meta_); - pointer_ = std::move(other.pointer_); - destroy_ = std::move(other.destroy_); - other.meta_ = {}; - other.pointer_ = nullptr; - other.destroy_ = nullptr; + Blob(std::move(other)).swap(*this); return *this; } @@ -59,18 +46,12 @@ class CAFFE2_API Blob { * Checks if the content stored in the blob is of type T. */ template - bool IsType() const { + bool IsType() const noexcept { return meta_.Match(); } - // TODO(jerryzh): Remove template - template - bool IsType(DeviceType device_type) const { - static_assert( - std::is_same::value, - "IsType(DeviceType) only available on " - "Tensor types."); - bool is_match = meta_.Match(); + bool IsTensorType(DeviceType device_type) const { + bool is_match = meta_.Match(); auto* tensor = static_cast(pointer_); if (is_match && tensor && tensor->GetDeviceType() == device_type) { return true; @@ -81,12 +62,12 @@ class CAFFE2_API Blob { /** * Returns the meta info of the blob. 
*/ - inline const TypeMeta& meta() const { return meta_; } + inline const TypeMeta& meta() const noexcept { return meta_; } /** * Returns a printable typename of the blob. */ - inline const char* TypeName() const { return meta_.name(); } + inline const char* TypeName() const noexcept { return meta_.name(); } /** * @brief Gets the const reference of the stored object. The code checks if @@ -107,10 +88,10 @@ class CAFFE2_API Blob { return *static_cast(pointer_); } - const void* GetRaw() const { + const void* GetRaw() const noexcept { return pointer_; } - void* GetRaw() { + void* GetRaw() noexcept { return pointer_; } @@ -149,7 +130,7 @@ class CAFFE2_API Blob { } inline Tensor* GetMutableTensor(DeviceType device_type) { - if (IsType(device_type)) { + if (IsTensorType(device_type)) { return static_cast(pointer_); } else { VLOG(1) << "Create new mutable object " << TypeMeta::TypeName() @@ -178,7 +159,7 @@ class CAFFE2_API Blob { } inline void* - Reset(void* allocated, const TypeMeta& meta, const DestroyCall& destroy) { + Reset(void* allocated, const TypeMeta& meta, DestroyCall* destroy) { if (pointer_ && destroy_) { destroy_(pointer_); } @@ -192,8 +173,8 @@ class CAFFE2_API Blob { * Releases the ownership, if any, this Blob has on the underlying pointer. * The user is then responsible for freeing the data if needed */ - inline DestroyCall Release() { - DestroyCall d = destroy_; + inline DestroyCall* Release() { + DestroyCall* d = destroy_; destroy_ = nullptr; return d; } @@ -289,7 +270,7 @@ class CAFFE2_API Blob { } TypeMeta meta_; void* pointer_ = nullptr; - DestroyCall destroy_ = nullptr; + DestroyCall* destroy_ = nullptr; AT_DISABLE_COPY_AND_ASSIGN(Blob); }; diff --git a/caffe2/core/blob_gpu_test.cc b/caffe2/core/blob_gpu_test.cc index 0f0fcb54906dcd..29bf0c3bc52ae6 100644 --- a/caffe2/core/blob_gpu_test.cc +++ b/caffe2/core/blob_gpu_test.cc @@ -148,7 +148,7 @@ TYPED_TEST(TensorGPUDeathTest, CannotAccessDataWhenEmpty) { } \ Blob new_blob; \ EXPECT_NO_THROW(new_blob.Deserialize(serialized)); \ - EXPECT_TRUE(new_blob.IsType(CUDA)); \ + EXPECT_TRUE(new_blob.IsTensorType(CUDA)); \ Tensor new_cpu_tensor(blob.Get(), CPU); \ EXPECT_EQ(new_cpu_tensor.ndim(), 2); \ EXPECT_EQ(new_cpu_tensor.dim(0), 2); \ @@ -198,7 +198,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { // Test if the restored blob is still of the same device. 
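// The blob.h hunk above rewrites Blob's move operations with the move-and-swap idiom
// (the move constructor delegates to the default constructor and swaps; move
// assignment constructs a temporary from the source and swaps it with *this), and
// replaces the templated IsType<Tensor>(device) check with a plain
// IsTensorType(device), which the test hunks here switch to. A minimal
// self-contained sketch of the move-and-swap idiom (Holder is illustrative, not
// caffe2::Blob):
#include <utility>

class Holder {
 public:
  Holder() noexcept = default;
  explicit Holder(int* p) noexcept : ptr_(p) {}
  ~Holder() { delete ptr_; }

  Holder(const Holder&) = delete;
  Holder& operator=(const Holder&) = delete;

  Holder(Holder&& other) noexcept : Holder() { swap(other); }  // steal by swapping with an empty object
  Holder& operator=(Holder&& other) noexcept {
    Holder(std::move(other)).swap(*this);                      // temporary walks off with the old state and frees it
    return *this;
  }

  void swap(Holder& other) noexcept { std::swap(ptr_, other.ptr_); }

 private:
  int* ptr_ = nullptr;
};
// Only one code path (the destructor) releases resources, so the move operations
// cannot leak or double-free, and self-move-assignment degrades to a harmless no-op.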
blob.Reset(); EXPECT_NO_THROW(blob.Deserialize(serialized)); - EXPECT_TRUE(blob.IsType(CUDA)); + EXPECT_TRUE(blob.IsTensorType(CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), gpu_id); // Test if we force the restored blob on a different device, we @@ -206,7 +206,7 @@ TEST(TensorTest, TensorSerializationMultiDevices) { blob.Reset(); proto.mutable_tensor()->mutable_device_detail()->set_cuda_gpu_id(0); EXPECT_NO_THROW(blob.Deserialize(proto.SerializeAsString())); - EXPECT_TRUE(blob.IsType(CUDA)); + EXPECT_TRUE(blob.IsTensorType(CUDA)); EXPECT_EQ(GetGPUIDForPointer(blob.Get().data()), 0); } } diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc index 97d17c6e5924f3..8103071c81ee26 100644 --- a/caffe2/core/blob_test.cc +++ b/caffe2/core/blob_test.cc @@ -86,15 +86,15 @@ TEST(BlobTest, Blob) { int* int_unused CAFFE2_UNUSED = blob.GetMutable(); EXPECT_TRUE(blob.IsType()); EXPECT_FALSE(blob.IsType()); - EXPECT_FALSE(blob.IsType(CPU)); + EXPECT_FALSE(blob.IsTensorType(CPU)); BlobTestFoo* foo_unused CAFFE2_UNUSED = blob.GetMutable(); EXPECT_TRUE(blob.IsType()); EXPECT_FALSE(blob.IsType()); - EXPECT_FALSE(blob.IsType(CPU)); + EXPECT_FALSE(blob.IsTensorType(CPU)); Tensor* tensor_unused CAFFE2_UNUSED = blob.GetMutableTensor(CPU); - EXPECT_TRUE(blob.IsType(CPU)); + EXPECT_TRUE(blob.IsTensorType(CPU)); EXPECT_FALSE(blob.IsType()); EXPECT_FALSE(blob.IsType()); } @@ -621,7 +621,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { } \ Blob new_blob; \ EXPECT_NO_THROW(new_blob.Deserialize(serialized)); \ - EXPECT_TRUE(new_blob.IsType(CPU)); \ + EXPECT_TRUE(new_blob.IsTensorType(CPU)); \ const TensorCPU& new_tensor = blob.Get(); \ EXPECT_EQ(new_tensor.ndim(), 2); \ EXPECT_EQ(new_tensor.dim(0), 2); \ @@ -650,7 +650,7 @@ TEST(TensorDeathTest, CannotCastDownLargeDims) { EXPECT_EQ(tensor_proto.field_name##_size(), 0); \ Blob new_blob; \ EXPECT_NO_THROW(new_blob.Deserialize(serialized)); \ - EXPECT_TRUE(new_blob.IsType(CPU)); \ + EXPECT_TRUE(new_blob.IsTensorType(CPU)); \ const TensorCPU& new_tensor = blob.Get(); \ EXPECT_EQ(new_tensor.ndim(), 2); \ EXPECT_EQ(new_tensor.dim(0), 0); \ @@ -681,7 +681,7 @@ TEST(TensorTest, TensorSerialization_CustomType) { EXPECT_EQ(proto.type(), "Tensor"); Blob new_blob; EXPECT_NO_THROW(new_blob.Deserialize(serialized)); - EXPECT_TRUE(new_blob.IsType(CPU)); + EXPECT_TRUE(new_blob.IsTensorType(CPU)); const TensorCPU& new_tensor = blob.Get(); EXPECT_EQ(new_tensor.ndim(), 2); EXPECT_EQ(new_tensor.dim(0), 2); @@ -724,7 +724,7 @@ TEST(TensorTest, float16) { } Blob new_blob; EXPECT_NO_THROW(new_blob.Deserialize(serialized)); - EXPECT_TRUE(new_blob.IsType(CPU)); + EXPECT_TRUE(new_blob.IsTensorType(CPU)); const TensorCPU& new_tensor = blob.Get(); EXPECT_EQ(new_tensor.ndim(), 1); EXPECT_EQ(new_tensor.dim(0), kSize); @@ -903,7 +903,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) { load_op->Run(); VLOG(1) << "Reading blob from workspace"; auto new_blob = ws.GetBlob("test"); - EXPECT_TRUE(new_blob->IsType(CPU)); + EXPECT_TRUE(new_blob->IsTensorType(CPU)); const auto& new_tensor = new_blob->Get(); EXPECT_EQ(new_tensor.ndim(), d1); diff --git a/caffe2/core/common_cudnn.h b/caffe2/core/common_cudnn.h index a8f2808e30eeab..2bbbce7df1e4ef 100644 --- a/caffe2/core/common_cudnn.h +++ b/caffe2/core/common_cudnn.h @@ -4,8 +4,6 @@ #include #include -#include - #include "caffe2/core/common.h" #include "caffe2/core/context.h" #include "caffe2/core/logging.h" @@ -16,6 +14,8 @@ #error("This Caffe2 install is not built with cudnn, so you should not include this file."); #endif 
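// The common_cudnn.h hunk around this point moves an #include (its target is elided
// in this rendering) from above the "not built with cudnn" guard to below it, so a
// build without the optional dependency stops at the explicit #error instead of
// failing on a missing header. The general shape, with illustrative names:
#ifndef MYLIB_WITH_FOO
#error "This build was configured without foo; do not include this header."
#endif

#include "foo/foo_api.h"   // only compiled when the feature is actually available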
+#include + static_assert( CUDNN_VERSION >= 5000, "Caffe2 requires cudnn version 5.0 or above."); diff --git a/caffe2/core/common_gpu.h b/caffe2/core/common_gpu.h index 4658cd0d756099..b73a6aefa406a0 100644 --- a/caffe2/core/common_gpu.h +++ b/caffe2/core/common_gpu.h @@ -116,7 +116,7 @@ inline int CudaVersion() { return CUDA_VERSION; } /** * Returns the number of devices. */ -int NumCudaDevices(); +CAFFE2_CUDA_API int NumCudaDevices(); /** * Check if the current running session has a cuda gpu present. diff --git a/caffe2/core/context.h b/caffe2/core/context.h index 4faaea93c6da12..aff66534d22198 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -153,7 +153,7 @@ class CAFFE2_API CPUContext final : public BaseContext { return true; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return CPU; } @@ -207,6 +207,7 @@ class CAFFE2_API CPUStaticContext : public BaseStaticContext { void ExtractDeviceOption(DeviceOption* device, const void* /*data*/) override { + CHECK(device); device->set_device_type(TypeToProto(GetDeviceType())); } diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h index 3090ca57aedc31..987c9ffe35299d 100644 --- a/caffe2/core/context_gpu.h +++ b/caffe2/core/context_gpu.h @@ -285,7 +285,7 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext { return cudaStreamQuery(stream) == cudaSuccess; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return CUDA; } @@ -403,6 +403,7 @@ class CAFFE2_CUDA_API CUDAStaticContext final : public BaseStaticContext { } void ExtractDeviceOption(DeviceOption* device, const void* data) override { + CAFFE_ENFORCE(data, "data cannot be nullptr"); device->set_device_type(TypeToProto(GetDeviceType())); device->set_cuda_gpu_id(GetGPUIDForPointer(data)); } diff --git a/caffe2/core/hip/context_hip.h b/caffe2/core/hip/context_hip.h index cd309e6473be4f..5a7613cf934fd0 100644 --- a/caffe2/core/hip/context_hip.h +++ b/caffe2/core/hip/context_hip.h @@ -269,7 +269,7 @@ class HIPContext final : public BaseContext { return hipStreamQuery(stream) == hipSuccess; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return HIP; } diff --git a/caffe2/core/nomnigraph/Representations/NeuralNet.cc b/caffe2/core/nomnigraph/Representations/NeuralNet.cc index 2765b48435e7bd..28f33a43cbff2f 100644 --- a/caffe2/core/nomnigraph/Representations/NeuralNet.cc +++ b/caffe2/core/nomnigraph/Representations/NeuralNet.cc @@ -216,17 +216,14 @@ NNNodeMatchCriteria criteriaSingleConsumer() { "Single consumer"); } -NNNodeMatchCriteria matchTensor() { - return matchOp("matchTensor"); +NNNodeMatchCriteria matchTensor(const std::string& debugString) { + return matchOp(debugString); } -NNMatchGraph::NodeRef operatorSubgraph( - NNMatchGraph& g, - const NNNodeMatchCriteria& root, - const std::vector& childrenCriteria, - int count) { - return subgraph( - g, matchTensor(), {subgraph(g, root, childrenCriteria)}, count); +NNMatchNode matchExternalTensorNode(const std::string& debugString) { + return NNMatchNode(matchTensor(debugString)) + .nonTerminal() + .excludeFromSubgraph(); } } // namespace nn diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h index c6b10f0912eca8..568d46a61ff561 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h @@ -39,7 +39,7 @@ class Node; // \brief Edge within a 
Graph. template -class CAFFE2_API Edge : public StorageType { +class Edge : public StorageType { public: using NodeRef = typename Graph::NodeRef; Edge(NodeRef tail, NodeRef head, U... args) @@ -73,7 +73,7 @@ class CAFFE2_API Edge : public StorageType { // \brief Node within a Graph. template -class CAFFE2_API Node : public StorageType, public Notifier> { +class Node : public StorageType, public Notifier> { public: using NodeRef = typename Graph::NodeRef; using EdgeRef = typename Graph::EdgeRef; @@ -152,7 +152,7 @@ class CAFFE2_API Node : public StorageType, public Notifier> { /// for example. /// template -class CAFFE2_API Subgraph { +class Subgraph { public: Subgraph() { DEBUG_PRINT("Creating instance of Subgraph: %p\n", this); @@ -219,7 +219,7 @@ class CAFFE2_API Subgraph { /// Everything is owned by the graph to simplify storage concerns. /// template -class CAFFE2_API Graph { +class Graph { public: using SubgraphType = Subgraph; using NodeRef = Node*; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h index f4c3940acd0711..2a03e428619b30 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Representations/NeuralNet.h @@ -53,9 +53,6 @@ class CAFFE2_API Annotation { return kind_; } - Annotation(const Annotation&) = delete; - Annotation& operator=(Annotation&) = delete; - private: const AnnotationKind kind_; }; @@ -427,7 +424,7 @@ CAFFE2_API void coalesceInsertedDataDependencies(repr::NNModule* m); template struct CAFFE2_EXPORT NodeHelper {}; -struct CAFFE2_API NNNodeMatchCriteria { +struct NNNodeMatchCriteria { std::function predicate; std::string debugString; @@ -474,8 +471,6 @@ NNNodeMatchCriteria matchOp(const std::string& debugString = "matchOp") { debugString); } -CAFFE2_API NNNodeMatchCriteria matchTensor(); - template NNNodeMatchCriteria matchOp( const std::function predicate, @@ -489,6 +484,12 @@ NNNodeMatchCriteria matchOp( debugString); }; +CAFFE2_API NNNodeMatchCriteria +matchTensor(const std::string& debugString = "matchTensor"); + +CAFFE2_API NNMatchNode +matchExternalTensorNode(const std::string& debugString = "matchExternalTensor"); + struct CAFFE2_API NNNodeMatch { static bool isMatch( const NNGraph::NodeRef& node, @@ -500,15 +501,6 @@ struct CAFFE2_API NNNodeMatch { using NNSubgraphMatcher = nom::matcher::SubgraphMatcher; -// This helper method makes it easy to create matching criteria in NNGraph. -// For example, operatorSubgraph(opMatch, ...) will refer to a tree like this: -// ... 
-> opMatch -> opMatch_Output -CAFFE2_API NNMatchGraph::NodeRef operatorSubgraph( - NNMatchGraph& g, - const NNNodeMatchCriteria& root, - const std::vector& childrenCriteria = {}, - int count = 1); - } // namespace nn } // namespace repr diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h index f11975284b217c..a303324fbb5701 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h @@ -29,18 +29,11 @@ namespace matcher { */ template -class CAFFE2_API MatchNode { +class MatchNode { public: static const int kStarCount = -1; - MatchNode( - const NodeMatchCriteria& criteria, - bool includeInSubgraph = true, - int count = 1, - bool nonTerminal = false) - : criteria_(criteria), - includeInSubgraph_(includeInSubgraph), - count_(count), - nonTerminal_(nonTerminal) {} + + MatchNode(const NodeMatchCriteria& criteria) : criteria_(criteria) {} MatchNode() = default; MatchNode(const MatchNode&) = default; @@ -55,6 +48,25 @@ class CAFFE2_API MatchNode { return count_; } + MatchNode& count(int count) { + count_ = count; + return *this; + } + + MatchNode& starCount() { + return count(kStarCount); + } + + MatchNode& nonTerminal() { + nonTerminal_ = true; + return *this; + } + + MatchNode& excludeFromSubgraph() { + includeInSubgraph_ = false; + return *this; + } + bool isNonTerminal() const { return nonTerminal_; } @@ -65,9 +77,9 @@ class CAFFE2_API MatchNode { private: NodeMatchCriteria criteria_; - bool includeInSubgraph_; - int count_; - bool nonTerminal_; + int count_ = 1; + bool includeInSubgraph_ = true; + bool nonTerminal_ = false; }; template @@ -76,38 +88,12 @@ using MatchGraph = Graph>; template using MatchNodeRef = typename MatchGraph::NodeRef; -template -MatchNodeRef subgraph( - MatchGraph& graph, - const NodeMatchCriteria& root, - const std::vector>& children, - int count = 1, - bool includeInSubgraph = true) { - auto result = graph.createNode( - MatchNode(root, includeInSubgraph, count, false)); - for (auto child : children) { - graph.createEdge(result, child); - } - return result; -} - -// Note that for nonTerminalSubgraph, the default value for includeInSubgraph -// is false since we typically do not want to include a nonTerminal node -// in the matched subgraph. -template -MatchNodeRef nonTerminalSubgraph( - MatchGraph& graph, - const NodeMatchCriteria& root, - int count = 1, - bool includeInSubgraph = false) { - return graph.createNode( - MatchNode(root, includeInSubgraph, count, true)); -} - // TODO: Reuse convertToDotString once convertToDotString can work // with subgraph. template -std::string debugString(MatchNodeRef rootCriteriaRef) { +std::string debugString( + MatchNodeRef rootCriteriaRef, + bool invertGraphTraversal) { std::ostringstream out; auto rootNode = rootCriteriaRef->data(); out << "{rootCriteria = '" << rootNode.getCriteria() << "'"; @@ -117,11 +103,14 @@ std::string debugString(MatchNodeRef rootCriteriaRef) { if (rootNode.isNonTerminal()) { out << ", nonTerminal = " << rootNode.isNonTerminal(); } - auto outEdges = rootCriteriaRef->getOutEdges(); - if (!outEdges.empty()) { + auto edges = invertGraphTraversal ? 
rootCriteriaRef->getInEdges() + : rootCriteriaRef->getOutEdges(); + if (!edges.empty()) { out << ", childrenCriteria = ["; - for (auto& child : outEdges) { - out << debugString(child->head()) << ", "; + for (auto& child : edges) { + auto nextNode = invertGraphTraversal ? child->tail() : child->head(); + out << debugString(nextNode, invertGraphTraversal) + << ", "; } out << "]"; } @@ -294,7 +283,8 @@ struct SubgraphMatcher { std::ostringstream debugMessage; debugMessage << "Subgraph root at " << root << " is not the same as " << matchedNode << " which previously matched criteria " - << debugString(rootCriteriaRef); + << debugString( + rootCriteriaRef, invertGraphTraversal); return SubgraphMatchResultType::notMatched(debugMessage.str()); } else { return SubgraphMatchResultType::notMatched(); @@ -307,7 +297,8 @@ struct SubgraphMatcher { std::ostringstream debugMessage; debugMessage << "Subgraph root at " << root << " does not match criteria " - << debugString(rootCriteriaRef); + << debugString( + rootCriteriaRef, invertGraphTraversal); return SubgraphMatchResultType::notMatched(debugMessage.str()); } else { return SubgraphMatchResultType::notMatched(); @@ -326,8 +317,10 @@ struct SubgraphMatcher { invertGraphTraversal ? root->getInEdges() : root->getOutEdges(); int numEdges = edges.size(); - const auto outEdges = rootCriteriaRef->getOutEdges(); - int numChildrenCriteria = outEdges.size(); + const auto criteriaEdges = invertGraphTraversal + ? rootCriteriaRef->getInEdges() + : rootCriteriaRef->getOutEdges(); + int numChildrenCriteria = criteriaEdges.size(); // The current algorithm implies that the ordering of the children is // important. The children nodes will be matched with the children subgraph @@ -336,7 +329,9 @@ struct SubgraphMatcher { int currentEdgeIdx = 0; for (int criteriaIdx = 0; criteriaIdx < numChildrenCriteria; criteriaIdx++) { - auto childrenCriteriaRef = outEdges[criteriaIdx]->head(); + auto childrenCriteriaRef = invertGraphTraversal + ? criteriaEdges[criteriaIdx]->tail() + : criteriaEdges[criteriaIdx]->head(); int expectedCount = childrenCriteriaRef->data().getCount(); bool isStarCount = @@ -374,7 +369,7 @@ struct SubgraphMatcher { debugMessage << "Child node at " << child << " does not match child criteria " << debugString( - childrenCriteriaRef) + childrenCriteriaRef, invertGraphTraversal) << ". 
We expected " << expectedCount << " matches but only found " << countMatch << "."; return SubgraphMatchResultType::notMatched(debugMessage.str()); @@ -399,7 +394,8 @@ struct SubgraphMatcher { std::ostringstream debugMessage; debugMessage << "Expected " << expectedCount << " matches for child criteria " - << debugString(childrenCriteriaRef) + << debugString( + childrenCriteriaRef, invertGraphTraversal) << " but only found " << countMatch; return SubgraphMatchResultType::notMatched(debugMessage.str()); } else { diff --git a/caffe2/core/nomnigraph/tests/NeuralNetTest.cc b/caffe2/core/nomnigraph/tests/NeuralNetTest.cc index 32c0f20f3571bf..874da120b5be8f 100644 --- a/caffe2/core/nomnigraph/tests/NeuralNetTest.cc +++ b/caffe2/core/nomnigraph/tests/NeuralNetTest.cc @@ -43,27 +43,28 @@ TEST(NeuralNetGraph, ReplaceGraph) { */ auto mg = NNMatchGraph(); - // clang-format off - auto matchSumOutput = operatorSubgraph(mg, - matchOp(), { - nonTerminalSubgraph(mg, matchTensor(), 2) - });; - auto pattern = subgraph(mg, - matchOp(), { - matchSumOutput - }); - // clang-format on + auto matchSumInput = + mg.createNode(std::move(matchExternalTensorNode().count(2))); + auto matchSum = mg.createNode(matchOp("matchSum")); + mg.createEdge(matchSumInput, matchSum); + + auto matchSumOutput = mg.createNode(matchTensor("matchSumOutput")); + mg.createEdge(matchSum, matchSumOutput); + + auto matchRelu = mg.createNode(matchOp("matchRelu")); + mg.createEdge(matchSumOutput, matchRelu); - EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(sum, pattern).isMatch()); + auto matchRoot = matchRelu; + EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(sum, matchRoot).isMatch()); EXPECT_FALSE( - NNSubgraphMatcher::isSubgraphMatch(reluOutput, pattern).isMatch()); - EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(input1, pattern).isMatch()); + NNSubgraphMatcher::isSubgraphMatch(reluOutput, matchRoot).isMatch()); + EXPECT_FALSE(NNSubgraphMatcher::isSubgraphMatch(input1, matchRoot).isMatch()); - EXPECT_TRUE(NNSubgraphMatcher::isSubgraphMatch(relu, pattern).isMatch()); + EXPECT_TRUE(NNSubgraphMatcher::isSubgraphMatch(relu, matchRoot).isMatch()); NNSubgraphMatcher::replaceSubgraph( graph, - pattern, + matchRoot, [&matchSumOutput]( NNGraph& g, NNGraph::NodeRef relu, diff --git a/caffe2/core/nomnigraph/tests/SubgraphMatcherTest.cc b/caffe2/core/nomnigraph/tests/SubgraphMatcherTest.cc index 7ed996aa5f1bdc..ee677665c6546d 100644 --- a/caffe2/core/nomnigraph/tests/SubgraphMatcherTest.cc +++ b/caffe2/core/nomnigraph/tests/SubgraphMatcherTest.cc @@ -41,27 +41,22 @@ TestMatchGraph::NodeRef Tree( const Criteria& root, const std::vector& children = {}, int count = 1) { - return subgraph(graph, root, children, count); + auto result = graph.createNode(std::move(TestMatchNode(root).count(count))); + for (auto& child : children) { + graph.createEdge(result, child); + } + return result; } TestMatchGraph::NodeRef NonTerminal(const Criteria& root, int count = 1) { - return nonTerminalSubgraph(graph, root, count); + return graph.createNode( + std::move(TestMatchNode(root).count(count).nonTerminal())); } Criteria any() { return Criteria("*"); } -// Make it more concise to create matching criteria in dataflow graph. -// For example, operatorTree("opA", ...) will refer to a tree like this: -// ... 
-> opA -> opA_Output -TestMatchGraph::NodeRef operatorTree( - const Criteria& root, - const std::vector& childrenCriteria = {}, - int count = 1) { - return Tree(any(), {Tree(root, childrenCriteria)}, count); -} - std::map TestGraphNodePrinter( TestGraph::NodeRef node) { std::map labelMap; @@ -185,20 +180,35 @@ struct DataFlowTestGraphCriteria { TestMatchGraph::NodeRef matchOpG; DataFlowTestGraphCriteria() { - // clang-format off - matchOpCOutput = operatorTree("opC", { - NonTerminal(Criteria("input"), TestMatchNode::kStarCount) - }); - matchOpG = Tree( - Criteria("opG"),{ - operatorTree("opF", { - operatorTree("opB", { - matchOpCOutput, matchOpCOutput, - }) - }), - NonTerminal(any()) // matches dataI - }); - // clang-format on + auto matchOpCInputs = + graph.createNode(std::move(TestMatchNode(Criteria("input")) + .starCount() + .nonTerminal() + .excludeFromSubgraph())); + auto matchOpC = graph.createNode(Criteria("opC")); + graph.createEdge(matchOpCInputs, matchOpC); + + matchOpCOutput = graph.createNode(any()); + graph.createEdge(matchOpC, matchOpCOutput); + + auto matchOpB = graph.createNode(Criteria("opB")); + graph.createEdge(matchOpCOutput, matchOpB); + graph.createEdge(matchOpCOutput, matchOpB); + + auto matchOpBOutput = graph.createNode(any()); + graph.createEdge(matchOpB, matchOpBOutput); + + auto matchOpF = graph.createNode(Criteria("opF")); + graph.createEdge(matchOpBOutput, matchOpF); + + auto matchOpFOutput = graph.createNode(any()); + graph.createEdge(matchOpF, matchOpFOutput); + + matchOpG = graph.createNode(Criteria("opG")); + auto matchDataI = graph.createNode( + std::move(TestMatchNode(any()).nonTerminal().excludeFromSubgraph())); + graph.createEdge(matchOpFOutput, matchOpG); + graph.createEdge(matchDataI, matchOpG); } }; diff --git a/caffe2/core/observer_test.cc b/caffe2/core/observer_test.cc index f4f4e81a3cb45c..fa8aee6d818366 100644 --- a/caffe2/core/observer_test.cc +++ b/caffe2/core/observer_test.cc @@ -151,6 +151,9 @@ TEST(ObserverTest, TestDAGNetBase) { EXPECT_EQ(1212, count_after - count_before); } +#if 0 +// This test intermittently segfaults, +// see https://github.com/pytorch/pytorch/issues/9137 TEST(ObserverTest, TestMultipleNetBase) { Workspace ws; ws.CreateBlob("in"); @@ -176,4 +179,5 @@ TEST(ObserverTest, TestMultipleNetBase) { EXPECT_EQ(net.get()->NumObservers(), prev_num); } +#endif } // namespace caffe2 diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 2def93f0b51d08..e75681ff3a9df6 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -143,36 +143,26 @@ class CAFFE2_API OperatorBase : public Observable { inline bool InputIsType(int idx) { static_assert( !std::is_same::value, - "You should use InputIsType(int, DeviceType) for " + "You should use InputIsTensorType(int, DeviceType) for " "Tensor."); return inputs_.at(idx)->template IsType(); } - template - inline bool InputIsType(int idx, DeviceType device_type) { - static_assert( - std::is_same::value, - "InputIsType(idx, DeviceType) only available on " - "Tensor types."); - return inputs_.at(idx)->template IsType(device_type); + inline bool InputIsTensorType(int idx, DeviceType device_type) { + return inputs_.at(idx)->IsTensorType(device_type); } template inline bool OutputIsType(int idx) { static_assert( !std::is_same::value, - "You should use OutputIsType(int, DeviceType) for " + "You should use OutputIsTensorType(int, DeviceType) for " "Tensor."); return outputs_.at(idx)->template IsType(); } - template - inline bool OutputIsType(int idx, DeviceType type) { - 
static_assert( - std::is_same::value, - "OutputIsType(idx, DeviceType) only available on " - "Tensor types."); - return outputs_.at(idx)->template IsType(type); + inline bool OutputIsTensorType(int idx, DeviceType type) { + return outputs_.at(idx)->IsTensorType(type); } inline int InputSize() const { diff --git a/caffe2/core/registry.h b/caffe2/core/registry.h index c7f3a7af539d51..7db975077ea8b9 100644 --- a/caffe2/core/registry.h +++ b/caffe2/core/registry.h @@ -17,21 +17,13 @@ #include #include +#include + #include "caffe2/core/common.h" #include "caffe2/core/typeid.h" namespace caffe2 { -template -inline void PrintOffendingKey(const KeyType& /*key*/) { - printf("[key type printing not supported]\n"); -} - -template <> -inline void PrintOffendingKey(const string& key) { - printf("Offending key: %s.\n", key.c_str()); -} - /** * @brief A template class that allows one to register classes by keys. * @@ -43,7 +35,7 @@ inline void PrintOffendingKey(const string& key) { * objects. */ template -class CAFFE2_API Registry { +class Registry { public: typedef std::function Creator; @@ -59,7 +51,7 @@ class CAFFE2_API Registry { std::lock_guard lock(register_mutex_); if (registry_.count(key) != 0) { printf("Key already registered.\n"); - PrintOffendingKey(key); + at::PrintOffendingKey(key); std::exit(1); } registry_[key] = creator; @@ -112,7 +104,7 @@ class CAFFE2_API Registry { }; template -class CAFFE2_API Registerer { +class Registerer { public: Registerer( const SrcType& key, diff --git a/caffe2/core/storage.h b/caffe2/core/storage.h index 973b07ee630642..35647d7b62d8af 100644 --- a/caffe2/core/storage.h +++ b/caffe2/core/storage.h @@ -16,271 +16,17 @@ #include "caffe2/core/logging.h" #include "caffe2/core/typeid.h" +#include #include #include #include +#include +#include namespace caffe2 { -using DataType = TypeMeta; -using DataPtr = std::shared_ptr; -using at::DeviceType; - -class CAFFE2_API StorageImpl : public c10::intrusive_ptr_target { - public: - StorageImpl() = delete; - StorageImpl(const StorageImpl&) = delete; - StorageImpl& operator=(const StorageImpl&) = delete; - - explicit StorageImpl(DeviceType device_type) : device_type_(device_type) {} - StorageImpl(DeviceType device_type, TypeMeta data_type) - : data_type_(data_type), device_type_(device_type) {} - template - StorageImpl( - DeviceType device_type, - TypeMeta data_type, - void* src, - size_t capacity, - Deleter d = nullptr) - : data_type_(data_type), device_type_(device_type) { - CAFFE_ENFORCE_WITH_CALLER( - data_type_.id() != TypeIdentifier::uninitialized(), - "To create storage with a raw external pointer you need to pass in an " - "initialized data_type(TypeMeta)."); - // Check if the deleter is a MemoryDeleter and is a simple nullptr. - if (std::is_same::value && - reinterpret_cast(static_cast(&d))[0] == - nullptr) { - // Use aliasing constructor trick to avoid calling the destructor. 
- data_ptr_ = std::shared_ptr(std::shared_ptr(), src); - } else { - data_ptr_.reset(src, d); - } - capacity_ = capacity; - } - - void reset() { - data_ptr_.reset(); - capacity_ = 0; - } - - template - inline bool IsType() const { - return data_type_.Match(); - } - - void* data() const { - return data_ptr_.get(); - } - - void* data() { - return data_ptr_.get(); - } - - DataPtr& data_ptr() { - return data_ptr_; - } - - const DataPtr& data_ptr() const { - return data_ptr_; - } - - void set_dtype(const DataType& data_type) { - data_type_ = data_type; - } - - const DataType& dtype() const { - return data_type_; - } - - size_t capacity() const { - return capacity_; - } - - int64_t numel() const { - return capacity_ / itemsize(); - } - - // TODO: remove later - void set_numel(int64_t numel) { - capacity_ = numel * itemsize(); - } - - inline DeviceType device_type() const { - return device_type_; - } - - inline size_t itemsize() const { - return data_type_.itemsize(); - } - - // Rule of Five - StorageImpl(StorageImpl&&) = default; - ~StorageImpl() = default; - StorageImpl& operator=(StorageImpl&&) = default; - - /** - * Can only be called when use_count is 1 - */ - template - void UniqueStorageShareExternalPointer( - void* src, - const DataType& data_type, - size_t capacity, - Deleter d = nullptr) { - data_type_ = data_type; - CAFFE_ENFORCE_WITH_CALLER( - data_type_.id() != TypeIdentifier::uninitialized(), - "To share with a raw external pointer you need to have meta " - "already set."); - // Check if the deleter is a MemoryDeleter and is a simple nullptr. - if (std::is_same::value && - reinterpret_cast(&d)[0] == nullptr) { - // Use aliasing constructor trick to avoid calling the destructor. - data_ptr_ = std::shared_ptr(std::shared_ptr(), src); - } else { - data_ptr_.reset(src, d); - } - capacity_ = capacity; - } - - private: - int64_t capacity_ = 0; - DataType data_type_; - DataPtr data_ptr_; - // allocator_ takes precedence over StaticContext from device_type_ - // Allocator* allocator_; - DeviceType device_type_ = CPU; -}; - -class CAFFE2_API Storage { - public: - Storage() {} - Storage(DeviceType device_type) - : storage_impl_(c10::make_intrusive(device_type)) {} - Storage(DeviceType device_type, TypeMeta data_type) - : storage_impl_( - c10::make_intrusive(device_type, data_type)) {} - - template - Storage( - T* src, - DeviceType device_type, - size_t capacity = 0, - Deleter d = nullptr) - : storage_impl_(c10::make_intrusive( - src, - device_type, - TypeMeta::Make(), - capacity, - d)) {} - - template - Storage( - void* src, - DeviceType device_type, - TypeMeta data_type, - size_t capacity, - Deleter d = nullptr) - : storage_impl_(c10::make_intrusive( - device_type, - data_type, - src, - capacity, - d)) {} - - void reset() { - storage_impl_->reset(); - } - - template - inline bool IsType() const { - return storage_impl_->IsType(); - } - - void* data() const { - return storage_impl_->data(); - } - - void* data() { - return storage_impl_->data(); - } - - DataPtr& data_ptr() { - return storage_impl_->data_ptr(); - } - - const DataPtr& data_ptr() const { - return storage_impl_->data_ptr(); - } - - void set_dtype(const DataType& data_type) { - storage_impl_->set_dtype(data_type); - } - - const DataType& dtype() const { - return storage_impl_->dtype(); - } - size_t capacity() const { - return storage_impl_->capacity(); - } - - int64_t numel() const { - return storage_impl_->numel(); - } - - // TODO: remove later - void set_numel(int64_t numel) { - storage_impl_->set_numel(numel); - } - - DeviceType 
device_type() const { - return storage_impl_->device_type(); - } - - inline size_t itemsize() const { - return storage_impl_->itemsize(); - } - - inline long use_count() const { - return storage_impl_.use_count(); - } - - inline bool unique() const { - return storage_impl_.unique(); - } - - template - void UniqueStorageShareExternalPointer( - void* src, - const DataType& data_type, - size_t capacity, - Deleter d = nullptr) { - CAFFE_ENFORCE_WITH_CALLER( - storage_impl_.unique(), - "UniqueStorageShareExternalPointer can only be called when \ - use_count == 1"); - storage_impl_->UniqueStorageShareExternalPointer( - src, data_type, capacity, d); - } - - protected: - c10::intrusive_ptr storage_impl_; -}; - -/** - * Create a Storage given an external pointer `src`. - * `device_type`: the device type of the storage - * `capacity`: the capacity of the Tensor - */ -template -Storage CreateStorage( - T* src, - DeviceType device_type, - size_t capacity = 0, - Deleter d = nullptr) { - return CreateStorage(src, device_type, TypeMeta::Make(), capacity, d); -} +using StorageImpl = at::StorageImpl; +using Storage = at::Storage; } // namespace caffe2 diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index 1659e6ba252bab..58b4c4b75e91cb 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -1,18 +1,6 @@ #include "caffe2/core/tensor.h" #include "caffe2/core/blob_stats.h" -#include "caffe2/core/flags.h" - -CAFFE2_DEFINE_bool( - caffe2_keep_on_shrink, - true, - "If set, keeps memory when a tensor is shrinking its size."); - -CAFFE2_DEFINE_int64( - caffe2_max_keep_on_shrink_memory, - LLONG_MAX, - "The maximum memory in bytes to keep on shrink, if the difference between " - "tensor sizes is bigger than this then tensor will be reset."); namespace caffe2 { @@ -93,7 +81,11 @@ vector GetTensorInfo( const void* c, size_t* capacity, DeviceOption* device) { + CHECK(capacity); const Tensor* tc = static_cast(c); + CHECK(tc); + CHECK(tc->unsafeGetTensorImpl()); + CHECK(tc->unsafeGetTensorImpl()->storage().unsafeGetStorageImpl()); *capacity = tc->capacity_nbytes(); tc->ExtractDeviceOption(device); return tc->dims(); diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 392cb523c21b44..27c09f00c4c1e2 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -2,873 +2,12 @@ #define CAFFE2_CORE_TENSOR_H_ #include "caffe2/core/storage.h" +#include "caffe2/core/tensor_impl.h" #include -// A global boolean variable to control whether we free memory when a Tensor -// is shrinked to a smaller size. As a result, a Tensor is always going to -// keep the memory allocated for its maximum capacity reshaped to so far. -CAFFE2_DECLARE_bool(caffe2_keep_on_shrink); - -// Since we can have high variance in blob memory allocated across different -// inputs in the same run, we will shrink the blob only if the memory gain -// is larger than this flag in bytes. -CAFFE2_DECLARE_int64(caffe2_max_keep_on_shrink_memory); - namespace caffe2 { -/** - * A utility function to convert vector to vector. 
- */ -inline vector ToVectorTIndex(const std::vector& src) { - return vector(src.begin(), src.end()); -} - -/** - * Return product of all dimensions starting from k - */ -inline TIndex size_from_dim_(int k, const vector& dims) { - TIndex r = 1; - for (size_t i = k; i < dims.size(); ++i) { - r *= dims[i]; - } - return r; -} - -// Product of all dims up to k (not including dims[k]) -inline TIndex size_to_dim_(int k, const vector& dims) { - CAFFE_ENFORCE((unsigned)k <= dims.size()); - TIndex r = 1; - for (int i = 0; i < k; ++i) { - r *= dims[i]; - } - return r; -} - -// Product of all dims between k and l (not including dims[k] and dims[l]) -inline TIndex size_between_dim_(int k, int l, const vector& dims) { - CAFFE_ENFORCE((unsigned)l < dims.size()); - TIndex r = 1; - if (k < l) { - for (int i = k + 1; i < l; ++i) { - r *= dims[i]; - } - } else { - for (int i = l + 1; i < k; ++i) { - r *= dims[i]; - } - } - return r; -} - -// Wrap around axis_index if it is negative, s.t., -1 is the last dim -inline int canonical_axis_index_(int axis_index, int ndims) { - CAFFE_ENFORCE_GE(axis_index, -ndims); - CAFFE_ENFORCE_LT(axis_index, ndims); - if (axis_index < 0) { - return axis_index + ndims; - } - return axis_index; -} - -/** - * @brief TensorImpl is the implementation of a tensor and the basic class - * in Caffe2 that stores a contiguous memory with its shape information. - * - * The TensorImpl class is essentially a wrapper around a device-specific memory - * (the device is specified by the Context template argument), and deals with - * the allocation and de-allocation of such memory. We make a simplified - * assumption that the memory is always contiguous. - */ -class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { - public: - TensorImpl() = delete; - explicit TensorImpl(DeviceType device_type) : storage_(device_type) {} - - /** - * @brief Creates a tensor of the given dimension. - * - * Note that the actual data allocation is not going to be carried out until - * the first time mutable_data() is called. - */ - // TODO: here, we create a Storage - // and immediately discard it in Resize() since - // reset_tensor will be true and FreeMemory will be called, - // we might want to avoid creating Storage twice? - explicit TensorImpl(const vector& dims, at::DeviceType device_type) - : storage_(device_type) { - Resize(dims); - } - - explicit TensorImpl(const vector& dims, at::DeviceType device_type) - : storage_(device_type) { - Resize(dims); - } - - /* Now we require that context_for_copy has the same device type as src since - * template is removed - */ - TensorImpl( - const TensorImpl& src, - BaseContext* context_for_copy, - at::DeviceType device_type) - : storage_(device_type) { - CopyFrom(src, context_for_copy); - } - - /** - * @brief: Create a Tensor of at::DeviceType `type` and initialize it with - * src Tensor - */ - TensorImpl(const TensorImpl& src, at::DeviceType device_type) - : storage_(device_type) { - CopyFrom(src); - } - - /** - * @brief Creates a tensor, and fills its contents with the given values. - * The type of tensor will be decided by the context parameter - */ - template - TensorImpl( - const vector& dims, - const vector& values, - BaseContext* context) - : storage_(context->GetDevicetype(), TypeMeta::Make()) { - Resize(dims); - CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), numel_); - context->CopyItemsFromCPU( - storage_.dtype(), numel_, values.data(), mutable_data()); - } - - /** - * @brief Creates a scalar tensor, and fills its content with the given value. 
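[Editor's note] The two value-initializing constructors documented here survive the TensorImpl/Tensor split and reappear on the Tensor wrapper later in this diff. A hedged usage sketch, with an invented function name and a default-constructed CPUContext standing in for whatever context the caller already has:

    #include "caffe2/core/context.h"
    #include "caffe2/core/tensor.h"

    namespace caffe2 {

    void ValueConstructorSketch() {
      CPUContext ctx;
      // Dims plus values: T is deduced from the value vector (float here).
      Tensor filled(std::vector<TIndex>{3}, std::vector<float>{1.f, 2.f, 3.f}, &ctx);
      // Scalar form: a zero-dim tensor holding a single value.
      Tensor scalar(4.f, &ctx);
      CAFFE_ENFORCE_EQ(filled.size(), 3);
      CAFFE_ENFORCE_EQ(scalar.size(), 1);
    }

    } // namespace caffe2
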
- * The type of tensor will be decided by the context parameter - */ - template < - typename T, - typename = typename std::enable_if::value>::type> - TensorImpl(const T& value, BaseContext* context) - : storage_(context->GetDevicetype(), TypeMeta::Make()) { - Resize(vector{}); - context->CopyItemsFromCPU( - storage_.dtype(), numel_, &value, mutable_data()); - } - - /** - * @brief Delete the copy constructor and use Clone explicitly - */ - TensorImpl(const TensorImpl& src) = delete; - - TensorImpl(TensorImpl&& src) noexcept { - swap(src); - } - - TensorImpl& operator=(TensorImpl&&) = default; - // Note(jiayq): possibly a rule-of-three violation, but we explicitly - // discourage the use of = for Tensors. - TensorImpl& operator=(const TensorImpl& src) = delete; - - virtual ~TensorImpl() noexcept {} - - /* - * Since we removed template from tensor, we now store a static - * context pointer in tensor, which indicates the type of the tensor. - */ - BaseStaticContext* GetStaticContext() const { - return get_static_context(GetDeviceType()); - } - - /* @brief - * Create a context that has the same device_type - * as the tensor. - * Note that this doesn't support passing in argument - * TODO(jerryzh): move this to a global registry - * that can create context for us - */ - std::unique_ptr CreateContext() const { - return GetStaticContext()->CreateContext(); - } - - at::DeviceType GetDeviceType() const { - return storage_.device_type(); - } - - /** - * @brief Copies the data from a source tensor, with a contex provided to - * carry out the underlying memcpy operation. - */ - void CopyFrom(const TensorImpl& src, BaseContext* context = nullptr) { - if ((void*)&src == (void*)this) { - return; - } - if (storage_.dtype() != src.meta()) { - storage_ = Storage(GetDeviceType(), src.meta()); - } - if (src.size() == -1) { - dims_.clear(); - numel_ = -1; - storage_.reset(); - return; - } - Resize(src.dims()); - if (size() > 0) { - if (storage_.dtype().copy()) { - CAFFE_ENFORCE( - GetDeviceType() == CPU, - "In CopyFrom source and dest tensors must both be CPU for meta copy"); - CAFFE_ENFORCE( - src.GetDeviceType() == CPU, - "In CopyFrom source and dest tensors must both be CPU for meta copy"); - storage_.dtype().copy()(src.raw_data(), raw_mutable_data(), size()); - } else { - // We'll need to use a non-CPU context to perform the copy if - // one of the context is not CPU since only non-CPU context - // knows how to copy between CPU and that context - if (src.GetDeviceType() != CPU || GetDeviceType() == CPU) { - if (!context) { - src.CreateContext()->CopyBytesToDevice( - nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); - } else { - CAFFE_ENFORCE( - context->GetDevicetype() == src.GetDeviceType(), - "Type for provided context does not match the type of source"); - context->CopyBytesToDevice( - nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); - } - } else { - // In case source context is CPU, and target context is non-CPU - // We'll have to create a Context from target and perform the - // copy using that context - CreateContext()->CopyBytesFromCPU( - nbytes(), src.raw_data(), raw_mutable_data()); - } - } - } - } - - /** - * @brief Extend the outer-most dimension of this tensor - * to dimension of `num`. 
- */ - void ExtendTo(TIndex num, float growthPct, BaseContext* context) { - CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); - CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0); - CAFFE_ENFORCE(context != nullptr, "Context must be provided."); - Extend(num - dims_[0], growthPct, context); - } - - /** - * @brief Extends the outer-most dimension of this tensor by num elements, - * preserving the existing data. - * - * The underlying data may be reallocated in order to accommodate the new - * elements, in which case this tensors' capacity is grown at a factor of - * growthPct. This ensures that Extend runs on an amortized O(1) time - * complexity. - */ - void Extend(TIndex num, float growthPct, BaseContext* context) { - CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); - CAFFE_ENFORCE_GE_WITH_CALLER( - num, 0, "`num` must be non-negative for Extend"); - auto newDims = dims_; - newDims[0] += num; - if (!storage_.data()) { - Resize(newDims); - return; - } - auto newNumel = std::accumulate( - newDims.begin(), - newDims.end(), - static_cast(1), - std::multiplies()); - if (newNumel * storage_.itemsize() <= storage_.capacity()) { - dims_ = newDims; - numel_ = newNumel; - return; - } - auto newCapacity = dims_; - newCapacity[0] = std::max( - newDims[0], std::ceil(dims_[0] * (growthPct + 100) / 100)); - auto oldData = std::move(storage_.data_ptr()); - auto oldSize = numel_; - auto oldDims = dims_; - Resize(newCapacity); - auto* newData = raw_mutable_data(storage_.dtype()); - CAFFE_ENFORCE( - context != nullptr, "Context must be provided to Extend the tensor"); - context->CopyItemsSameDevice( - storage_.dtype(), oldSize, oldData.get(), newData); - reserved_ = true; - dims_ = newDims; - numel_ = newNumel; - } - - /** - * @brief Shrinks the outer-most dimension to given size, keeping the data. - * - * This method guarantees that no re-allocations are carried out, which means - * that the extra capacity after the end of the shurnk tensor is maintained. - */ - void ShrinkTo(TIndex outer_dim) { - CAFFE_ENFORCE_WITH_CALLER(dims_.size() >= 1, "Tensor must be at least 1D"); - CAFFE_ENFORCE_WITH_CALLER( - outer_dim <= dims_[0], - "New outer dimension must be smaller than current."); - CAFFE_ENFORCE( - storage_.unique(), - "Can't call ShrinkTo on shared storage, please call Resize instead."); - dims_[0] = outer_dim; - numel_ = std::accumulate( - dims_.begin(), - dims_.end(), - static_cast(1), - std::multiplies()); - } - - /** - * @brief Reserve space for the underlying tensor. - * - * This must be called after Resize(), since we only specify the first - * dimension This does not copy over the old data to the newly allocated space - */ - template - void ReserveSpace(const T& outer_dim) { - CAFFE_ENFORCE( - numel_ != -1, "size should be initialized before calling ReserveSpace"); - CAFFE_ENFORCE( - storage_.unique(), "Can't call ReserveSpace on shared storage."); - auto newCapacity = dims_; - newCapacity[0] = outer_dim; - auto newNumel = std::accumulate( - newCapacity.begin(), - newCapacity.end(), - static_cast(1), - std::multiplies()); - if (newNumel * storage_.itemsize() <= storage_.capacity()) { - return; - } - // Old data is discarded - storage_.data_ptr().reset(); - auto oldSize = numel_; - auto oldDims = dims_; - Resize(newCapacity); - // Allocate new memory but don't copy over the data - raw_mutable_data(storage_.dtype()); - dims_ = oldDims; - numel_ = oldSize; - reserved_ = true; - } - - /** - * @brief Resizes a tensor. - * - * Resize takes in a vector of ints specifying the dimensions of the tensor. 
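[Editor's note] A hedged sketch of how Resize interacts with the separate Reshape call shown a bit further down; illustrative only, with an invented function name:

    #include "caffe2/core/tensor.h"

    namespace caffe2 {

    void ResizeVsReshapeSketch() {
      Tensor t(std::vector<TIndex>{2, 3}, CPU);
      t.mutable_data<float>();               // allocation happens here, not in the constructor

      t.Reshape(std::vector<TIndex>{3, 2});  // same element count: storage is left untouched

      t.Resize(4, 5);                        // element count changes: the old buffer may be freed,
      t.mutable_data<float>();               // and a fresh one is allocated on the next mutable_data()
    }

    } // namespace caffe2
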
- * You can pass in an empty vector to specify that it is a scalar (i.e. - * containing one single item). - * - * The underlying storage may be deleted after calling Resize: if the new - * shape leads to a different number of items in the tensor, the old memory - * is deleted and new memory will be allocated next time you call - * mutable_data(). However, if the shape is different but the total number of - * items is the same, the underlying storage is kept. - */ - template - void Resize(Ts... dim_source) { - bool is_init = numel_ == -1; - bool size_changed = SetDims(dim_source...); - if (size_changed) { - // If needed, we will free the data. the next mutable_data() call - // will create the data storage. - bool reset_tensor = false; - if (reserved_) { - // If tensor is reserved then don't claim its memeory unless capacity() - // is smaller than new size - reset_tensor = storage_.capacity() < numel_ * storage_.itemsize(); - } else { - reset_tensor = storage_.capacity() < numel_ * storage_.itemsize() || - !FLAGS_caffe2_keep_on_shrink || - storage_.capacity() - numel_ * storage_.itemsize() > - FLAGS_caffe2_max_keep_on_shrink_memory; - } - - if (reset_tensor && !is_init) { - FreeMemory(); - } - } - } - - /** - * Resize the tensor like the source tensor. Note that this is just a - * sugar wrapper that essentially calls Resize(src_tensor.dims()). - */ - inline void ResizeLike(const TensorImpl& src_tensor) { - // Note: need casting for different context types. - if (static_cast(this) != static_cast(&src_tensor)) { - Resize(src_tensor.dims()); - } - } - - /** - * Resizes the tensor without touching underlying storage. - * This requires the total size of the tensor to remains constant. - */ - inline void Reshape(const vector& dims) { - TIndex new_size = 1; - for (auto d : dims) { - CAFFE_ENFORCE_GE_WITH_CALLER(d, 0); - new_size *= d; - } - CAFFE_ENFORCE_WITH_CALLER( - new_size == numel_, - "New size and old size are not equal. You cannot use Reshape, " - "but should use Resize." - // TODO(jiayq): remove the following warning after pending diffs - // stabilize. - " The old caffe2 mixes Reshape and Resize but this behavior has " - "been changed. If you find this error, most likely you will need " - "to change corresponding code from Reshape to Resize."); - dims_ = dims; - } - - inline void Reshape(const vector& dims) { - Reshape(ToVectorTIndex(dims)); - } - - /** - * Release whatever memory the tensor was holding but keep size and type - * information. Subsequent call to mutable_data will trigger new memory - * allocation. - */ - inline void FreeMemory() { - // We'll detach from the old Storage and create a new one - storage_ = Storage(storage_.device_type(), storage_.dtype()); - } - - /** - * A utility function to print the debug string for the tensor. Note that this - * is very slow since it involves quite some string operations, so do not use - * it in your performance-critical code. - */ - string DebugString() const { - std::stringstream ss; - ss << "A Tensor of item size " << storage_.itemsize() << " and type " - << storage_.dtype().name() << " and dimension ("; - for (int d : dims_) { - ss << d << ","; - } - ss << ")."; - return ss.str(); - } - - void swap(TensorImpl& other) noexcept { - std::swap(dims_, other.dims_); - std::swap(numel_, other.numel_); - std::swap(storage_, other.storage_); - } - - /** - * @brief Shares the data with another tensor. - * - * To share data between two tensors, the sizes of the two tensors must be - * equal already. 
The reason we do not implicitly do a Resize to make the two - * tensors have the same shape is that we want to allow tensors of different - * shapes but the same number of items to still be able to share data. This - * allows one to e.g. have a n-dimensional Tensor and a flattened version - * sharing the same underlying storage. - * - * The source tensor should already have its data allocated. - */ - void ShareData(const TensorImpl& src) { - // Right now, we are assuming the device_type are the same, since it is - // inherently the same in the non-templatized code. We should probably add - // an ENFORCE here which might affect perf a little bit. - CAFFE_ENFORCE_EQ_WITH_CALLER( - src.numel_, - numel_, - "Size mismatch - did you call reshape before sharing the data?"); - // It is possible that the source tensor hasn't called mutable_data() yet, - // in which case ShareData() doesn't make much sense since we don't really - // know what to share yet. - CAFFE_ENFORCE_WITH_CALLER( - src.storage_.data() || src.numel_ == 0, - "Source tensor has no content and has size > 0"); - // Finally, do sharing. - /* Since we create new Storage whenever we need to change data_type/capacity - * this still keeps the original semantics - */ - storage_ = src.storage(); - } - - /** - * @brief Shares the data with an externally managed pointer. - * - * This is similar to ShareData() but the source is a pointer with an advanced - * deleter option. In default, no deletion takes place, and one needs to make - * sure that the external memory is deallocated only after the tensor finishes - * using it. If a Deleter object is passed in, when this tensor is reallocated - * or freed, the deleter function is going to be called. - */ - template - void ShareExternalPointer(T* src, size_t capacity = 0, Deleter d = nullptr) { - ShareExternalPointer(src, TypeMeta::Make(), capacity, d); - } - - template - void ShareExternalPointer( - void* src, - const TypeMeta& data_type, - size_t capacity = 0, - Deleter d = nullptr) { - CAFFE_ENFORCE_WITH_CALLER( - data_type.id() != TypeIdentifier::uninitialized(), - "To share with a raw external pointer you need to pass in an " - "initialized data_type(TypeMeta)."); - if (!capacity) { - capacity = numel_ * data_type.itemsize(); - } - if (storage_.unique()) { - CAFFE_ENFORCE_WITH_CALLER( - numel_ >= 0, - "To share data with a raw pointer, you need to set shape first."); - storage_.UniqueStorageShareExternalPointer(src, data_type, capacity, d); - } else { - // Create a new Storage - storage_ = Storage(src, GetDeviceType(), data_type, capacity, d); - } - } - - /** - * Returns a const raw void* pointer of the underlying storage. mutable_data() - * or raw_mutable_data() must have been called prior to this function call. - */ - inline const void* raw_data() const { - CAFFE_ENFORCE_WITH_CALLER(storage_.data() || numel_ == 0); - return storage_.data(); - } - - /** - * Returns a typed pointer of the underlying storage. mutable_data() or - * raw_mutable_data() must have been called prior to this function call, and - * the data type must be of the correct type. If you want to get a void* - * pointer instead, use raw_data(). - */ - template - inline const T* data() const { - CAFFE_ENFORCE_WITH_CALLER( - storage_.data() || numel_ == 0, - "The tensor is of non-zero shape, but its data is not allocated yet. 
" - "Caffe2 uses a lazy allocation, so you will need to call " - "mutable_data() or raw_mutable_data() to actually allocate memory."); - CAFFE_ENFORCE_WITH_CALLER( - IsType(), - "Tensor type mismatch, caller expects elements to be ", - TypeMeta::TypeName(), - " while tensor contains ", - storage_.dtype().name()); - return static_cast(storage_.data()); - } - - /** - * Returns a mutable raw pointer of the underlying storage. Since we will need - * to know the type of the data for allocation, a TypeMeta object is passed in - * to specify the necessary information. This is conceptually equivalent of - * calling mutable_data() where the TypeMeta parameter meta is derived from - * the type T. This function differs from mutable_data() in the sense that - * the type T can be specified during runtime via the TypeMeta object. - * - * If the existing data does not match the desired type, it will be deleted - * and a new storage will be created. - */ - inline void* raw_mutable_data(const TypeMeta& meta) { - // For 0-size tensors it's fine to return any pointer (including nullptr) - if (storage_.dtype() == meta && (storage_.data() || numel_ == 0)) { - return storage_.data(); - } else { - bool had_special_dtor = storage_.dtype().dtor() != nullptr; - if (storage_.unique()) { - storage_.set_dtype(meta); - // TODO: recalcuate numel when we store numel instead of capacity in - // Storage - } else { - if (storage_.dtype() != meta) { - storage_ = Storage(storage_.device_type(), meta); - } - } - CAFFE_ENFORCE_WITH_CALLER( - numel_ >= 0, - "Tensor is not initialized. You probably need to call Resize() " - "before calling mutable_data()"); - - // We can reuse the existing buffer if the current data does not have - // a special destructor and the new data doesn't have a special - // constructor. - if (numel_ == 0 || - (meta.ctor() == nullptr && !had_special_dtor && - storage_.capacity() >= numel_ * storage_.itemsize())) { - return storage_.data(); - } - if (meta.ctor()) { - // For types that need placement new, we will call it, as well as - // making sure that when the data is freed, it calls the right - // destruction procedure. - auto size = numel_; - auto dtor = storage_.dtype().dtor(); - auto ptr_and_deleter = - GetStaticContext()->New(numel_ * storage_.itemsize()); - auto deleter = ptr_and_deleter.second; - storage_.data_ptr().reset( - ptr_and_deleter.first, [size, dtor, deleter](void* ptr) -> void { - dtor(ptr, size); - deleter(ptr); - }); - storage_.dtype().ctor()(storage_.data(), numel_); - } else { - // For fundamental type, new and delete is easier. - auto ptr_and_deleter = - GetStaticContext()->New(numel_ * storage_.itemsize()); - storage_.data_ptr().reset( - ptr_and_deleter.first, ptr_and_deleter.second); - } - storage_.set_numel(numel_); - return storage_.data(); - } - } - - /** - * Returns a mutable raw pointer of the underlying storage. This can only be - * used when you know for sure that the underlying storage of the tensor is - * already created via an earlier raw_mutable_data(meta) call or a - * mutable_data() call. - * - * If the existing data does not match the desired type, it will be deleted - * and a new storage will be created. - */ - inline void* raw_mutable_data() { - CAFFE_ENFORCE_WITH_CALLER( - storage_.dtype().id() != TypeIdentifier::uninitialized(), - "Calling raw_mutable_data() without meta, but the current meta is " - "of unknown type."); - return raw_mutable_data(storage_.dtype()); - } - - /** - * Returns a typed pointer of the underlying storage. 
- * - * For fundamental types, we reuse possible existing storage if there - * is sufficient capacity. - */ - template - inline T* mutable_data() { - if ((numel_ == 0 || storage_.data()) && IsType()) { - return static_cast(storage_.data()); - } - // Check it here statically - otherwise TypeMeta would throw the runtime - // error in attempt to invoke TypeMeta::ctor() - static_assert( - std::is_default_constructible::value, - "Tensor can't hold non-default-constructible types"); - return static_cast(raw_mutable_data(TypeMeta::Make())); - } - - /** - * Returns the number of dimensions of the data. - */ - inline int ndim() const { - return dims_.size(); - } - /** - * Returns the size (i.e. the number of items) of the tensor. - */ - inline TIndex size() const { - return numel_; - } - /** - * Return the number of bytes each item takes in the tensor. - */ - inline size_t itemsize() const { - return storage_.itemsize(); - } - /** - * Returns the total number of bytes of the storage. - * - * This is equivalent to calling size() * itemsize(). - */ - inline size_t nbytes() const { - return numel_ * itemsize(); - ; - } - - inline size_t capacity_nbytes() const { - return storage_.capacity(); - } - /** - * Returns the dimensions of the tensor as a vector. - */ - inline const vector& dims() const { - return dims_; - } - - inline TIndex size_from_dim(int k) const { - return size_from_dim_(k, dims_); - } - - inline TIndex size_to_dim(int k) const { - return size_to_dim_(k, dims_); - } - - inline TIndex size_between_dim(int k, int l) const { - return size_between_dim_(k, l, dims_); - } - - /** - * Returns the 'canonical' version of a (usually) user-specified axis, - * allowing for negative indexing (e.g., -1 for the last axis). - * - * @param axis_index the axis index. - * If 0 <= index < ndim(), return index. - * If -ndim <= index <= -1, return (ndim() - (-index)), - * e.g., the last axis index (ndim() - 1) if index == -1, - * the second to last if index == -2, etc. - * Dies on out of range index. - */ - inline int canonical_axis_index(int axis_index) const { - return canonical_axis_index_(axis_index, ndim()); - } - - /** - * Checks if the tensor content is of the given data type. - */ - template - inline bool IsType() const { - return storage_.IsType(); - } - /** - * Returns the TypeMeta object associated with the current data type. - */ - inline const TypeMeta& meta() const { - return storage_.dtype(); - } - - /** - * Returns the i-th dimension of the tensor in int. - * - * This function returns an int value instead of TIndex, which depending on - * the typedef could be int64. If you want int64 dim values, make sure you - * call dim() instead. - */ - inline int dim32(const int i) const { -#ifndef NDEBUG - CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); - CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); -#endif - CAFFE_ENFORCE_LT_WITH_CALLER(dims_[i], std::numeric_limits::max()); - return static_cast(dims_[i]); - } - - /** - * Returns the i-th dimension of the tensor. Note that the passed in index - * must be between 0 (inclusive) and the number of dimensions, otherwise - * this function will produce a fatal message. 
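[Editor's note] To make the lazy-allocation contract spelled out in the data()/mutable_data() comments above concrete, a small sketch (again illustrative, not part of the patch, function name invented):

    #include "caffe2/core/tensor.h"

    namespace caffe2 {

    void LazyAllocationSketch() {
      Tensor t(std::vector<TIndex>{4}, CPU);
      // Nothing is allocated yet; calling t.data<float>() here would hit the enforce above.
      float* p = t.mutable_data<float>();  // the first call fixes the dtype and allocates
      p[0] = 1.f;
      const float* q = t.data<float>();    // fine now: read-only view of the same buffer
      CAFFE_ENFORCE_EQ(t.nbytes(), 4 * sizeof(float));
      (void)q;
    }

    } // namespace caffe2
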
- */ - inline TIndex dim(const int i) const { -#ifndef NDEBUG - CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); - CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); -#endif - return dims_[i]; - } - - void ExtractDeviceOption(DeviceOption* device) const { - GetStaticContext()->ExtractDeviceOption(device, raw_data()); - } - - const Storage& storage() { - return storage_; - } - - const Storage& storage() const { - return storage_; - } - - protected: - using DimVector = std::vector; - DimVector dims_; // sizes_ - TIndex numel_ = -1; // numel_ - // we decide to keep reserved_ and it will - // live in Tensor after the split - // The logic is that if Extend() or ReserveSpace() were ever called, - // then subsequent Resize()s will not free up Storage. - bool reserved_ = false; - Storage storage_; - // int64_t storage_offset_; - - private: - template < - typename T, - typename = typename std::enable_if::value>::type> - bool SetDims(const vector& src) { - auto old_numel = numel_; - dims_.resize(src.size()); - TIndex new_numel = 1; - for (size_t i = 0; i < src.size(); ++i) { - new_numel *= src[i]; - dims_[i] = src[i]; - } - numel_ = new_numel; - return numel_ != old_numel; - } - - bool SetDims() { - auto old_numel = numel_; - dims_.resize(0); - numel_ = 1; - return numel_ != old_numel; - } - - // TODO(jiayq): maybe rewrite the following functions with initializer list. - // NVCC does not play well with initializer lists last time, but worth - // another shot. - bool SetDims(const TIndex d0) { - auto old_numel = numel_; - dims_.resize(1); - dims_[0] = d0; - numel_ = d0; - return numel_ != old_numel; - } - - bool SetDims(const TIndex d0, const TIndex d1) { - auto old_numel = numel_; - dims_.resize(2); - dims_[0] = d0; - dims_[1] = d1; - numel_ = d0 * d1; - return numel_ != old_numel; - } - - bool SetDims(const TIndex d0, const TIndex d1, const TIndex d2) { - auto old_numel = numel_; - dims_.resize(3); - dims_[0] = d0; - dims_[1] = d1; - dims_[2] = d2; - numel_ = d0 * d1 * d2; - return numel_ != old_numel; - } - - bool - SetDims(const TIndex d0, const TIndex d1, const TIndex d2, const TIndex d3) { - auto old_numel = numel_; - dims_.resize(4); - dims_[0] = d0; - dims_[1] = d1; - dims_[2] = d2; - dims_[3] = d3; - numel_ = d0 * d1 * d2 * d3; - return numel_ != old_numel; - } -}; - class CAFFE2_API UndefinedTensorImpl final : public TensorImpl { UndefinedTensorImpl() : TensorImpl(CPU){}; @@ -911,45 +50,75 @@ class CAFFE2_API Tensor final { return impl_.get(); } - explicit Tensor(DeviceType type) - : impl_(c10::make_intrusive(type)) {} + explicit Tensor(Storage storage) + : impl_(c10::make_intrusive(std::move(storage))) {} + /** + * @brief Creates a tensor of the given dimension. + * + * Note that the actual data allocation is not going to be carried out until + * the first time mutable_data() is called. + */ explicit Tensor(const vector& dims, DeviceType type) - : impl_( - c10::make_intrusive(dims, type)) {} + : Tensor(Storage(type)) { + // TODO: here, we create a Storage + // and immediately discard it in Resize() since + // reset_tensor will be true and FreeMemory will be called, + // we might want to avoid creating Storage twice? 
+ Resize(dims); + } explicit Tensor(const vector& dims, DeviceType type) - : impl_( - c10::make_intrusive(dims, type)) {} + : Tensor(Storage(type)) { + Resize(dims); + } + /** + * context_for_copy is required to have the same DeviceType as src + */ Tensor(const Tensor& src, BaseContext* context_for_copy, DeviceType type) - : impl_(c10::make_intrusive( - *src.impl_, - context_for_copy, - type)) {} + : Tensor(Storage(type)) { + CopyFrom(src, context_for_copy); + } + /** + * @brief: Create a Tensor of at::DeviceType `type` and initialize it with + * src Tensor + */ Tensor(const Tensor& src, DeviceType type) - : impl_(c10::make_intrusive( - *src.impl_, - type)) {} + : Tensor(Storage(type)) { + CopyFrom(src); + } + /** + * @brief Creates a tensor, and fills its contents with the given values. + * The type of tensor will be decided by the context parameter + */ template Tensor( const vector& dims, const vector& values, BaseContext* context) - : impl_(c10::make_intrusive( - dims, - values, - context)) {} + : Tensor(Storage(context->device_type(), TypeMeta::Make())) { + Resize(dims); + CAFFE_ENFORCE_EQ_WITH_CALLER(values.size(), size()); + context->CopyItemsFromCPU( + storage().dtype(), size(), values.data(), mutable_data()); + } + /** + * @brief Creates a scalar tensor, and fills its content with the given value. + * The type of tensor will be decided by the context parameter + */ template < typename T, typename = typename std::enable_if::value>::type> Tensor(const T& value, BaseContext* context) - : impl_(c10::make_intrusive( - value, - context)) {} + : Tensor(Storage(context->device_type(), TypeMeta::Make())) { + Resize(std::vector{}); + context->CopyItemsFromCPU( + storage().dtype(), size(), &value, mutable_data()); + } Tensor Clone() const { Tensor x(GetDeviceType()); @@ -1019,26 +188,40 @@ class CAFFE2_API Tensor final { // swap method swaps the CONTENTS of the tensors, while std::swap // swaps the POINTERS. void swap(const Tensor& other) const noexcept { - impl_.get()->swap(*other.impl_.get()); + // NB: use get() to get a non-const pointer! 
+ std::swap(*impl_.get(), *other.impl_.get()); } void ShareData(const Tensor& src) const { impl_.get()->ShareData(*src.impl_.get()); } - template - void ShareExternalPointer(T* src, size_t capacity = 0, Deleter d = nullptr) - const { - impl_.get()->ShareExternalPointer(src, capacity, d); + template + void ShareExternalPointer( + T* src, + size_t capacity = 0, + MemoryDeleter d = nullptr) const { + impl_.get()->ShareExternalPointer(src, capacity, d); + } + + template + void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) const { + impl_.get()->ShareExternalPointer(std::move(data_ptr), capacity); } - template void ShareExternalPointer( void* src, const TypeMeta& meta, size_t capacity = 0, - Deleter d = nullptr) const { - impl_.get()->ShareExternalPointer(src, meta, capacity, d); + MemoryDeleter d = nullptr) const { + impl_.get()->ShareExternalPointer(src, meta, capacity, d); + } + + void ShareExternalPointer( + at::DataPtr&& data_ptr, + const TypeMeta& data_type, + size_t capacity) { + impl_.get()->ShareExternalPointer(std::move(data_ptr), data_type, capacity); } inline const void* raw_data() const { @@ -1103,6 +286,18 @@ class CAFFE2_API Tensor final { return impl_.get()->canonical_axis_index(axis_index); } + inline int64_t stride(int64_t dim) const { + return impl_.get()->stride(dim); + } + + inline at::DimVector strides() { + return impl_.get()->strides(); + } + + inline bool is_contiguous() const { + return impl_.get()->is_contiguous(); + } + template inline bool IsType() const { return impl_.get()->IsType(); @@ -1123,6 +318,10 @@ class CAFFE2_API Tensor final { inline void ExtractDeviceOption(DeviceOption* device) const { return impl_.get()->ExtractDeviceOption(device); } + + const Storage& storage() { + return impl_->storage(); + } }; using TensorCPU = Tensor; diff --git a/caffe2/core/tensor_impl.cc b/caffe2/core/tensor_impl.cc new file mode 100644 index 00000000000000..cff98c6101ea5d --- /dev/null +++ b/caffe2/core/tensor_impl.cc @@ -0,0 +1,14 @@ +#include "caffe2/core/tensor_impl.h" + +#include "caffe2/core/flags.h" + +CAFFE2_DEFINE_bool( + caffe2_keep_on_shrink, + true, + "If set, keeps memory when a tensor is shrinking its size."); + +CAFFE2_DEFINE_int64( + caffe2_max_keep_on_shrink_memory, + LLONG_MAX, + "The maximum memory in bytes to keep on shrink, if the difference between " + "tensor sizes is bigger than this then tensor will be reset."); diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h new file mode 100644 index 00000000000000..3f42ed36b30954 --- /dev/null +++ b/caffe2/core/tensor_impl.h @@ -0,0 +1,915 @@ +#pragma once + +#include +#include +#include + +#include "caffe2/core/allocator.h" +#include "caffe2/core/common.h" +#include "caffe2/core/flags.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/context_base.h" + +// A global boolean variable to control whether we free memory when a Tensor +// is shrinked to a smaller size. As a result, a Tensor is always going to +// keep the memory allocated for its maximum capacity reshaped to so far. +CAFFE2_DECLARE_bool(caffe2_keep_on_shrink); + +// Since we can have high variance in blob memory allocated across different +// inputs in the same run, we will shrink the blob only if the memory gain +// is larger than this flag in bytes. +CAFFE2_DECLARE_int64(caffe2_max_keep_on_shrink_memory); + +namespace caffe2 { + +/** + * A utility function to convert vector to vector. 
+ */ +inline std::vector ToVectorTIndex(const std::vector& src) { + return std::vector(src.begin(), src.end()); +} + +/** + * Return product of all dimensions starting from k + */ +inline TIndex size_from_dim_(int k, const std::vector& dims) { + TIndex r = 1; + for (size_t i = k; i < dims.size(); ++i) { + r *= dims[i]; + } + return r; +} + +// Product of all dims up to k (not including dims[k]) +inline TIndex size_to_dim_(int k, const std::vector& dims) { + CAFFE_ENFORCE((unsigned)k <= dims.size()); + TIndex r = 1; + for (int i = 0; i < k; ++i) { + r *= dims[i]; + } + return r; +} + +// Product of all dims between k and l (not including dims[k] and dims[l]) +inline TIndex size_between_dim_(int k, int l, const std::vector& dims) { + CAFFE_ENFORCE((unsigned)l < dims.size()); + TIndex r = 1; + if (k < l) { + for (int i = k + 1; i < l; ++i) { + r *= dims[i]; + } + } else { + for (int i = l + 1; i < k; ++i) { + r *= dims[i]; + } + } + return r; +} + +// Wrap around axis_index if it is negative, s.t., -1 is the last dim +inline int canonical_axis_index_(int axis_index, int ndims) { + CAFFE_ENFORCE_GE(axis_index, -ndims); + CAFFE_ENFORCE_LT(axis_index, ndims); + if (axis_index < 0) { + return axis_index + ndims; + } + return axis_index; +} + +/** + * @brief TensorImpl is the implementation of a tensor and the basic class + * in Caffe2 that stores a contiguous memory with its shape information. + * + * The TensorImpl class is essentially a wrapper around a device-specific memory + * (the device is specified by the Context template argument), and deals with + * the allocation and de-allocation of such memory. We make a simplified + * assumption that the memory is always contiguous. + */ +class CAFFE2_API TensorImpl : public c10::intrusive_ptr_target { + public: + TensorImpl() = delete; + + explicit TensorImpl(at::Storage storage) : storage_(std::move(storage)), storage_offset_(0) { + data_type_ = storage_ ? storage_.dtype() : TypeMeta{}; + } + + TensorImpl(const TensorImpl&) = default; + TensorImpl& operator=(const TensorImpl&) = default; + TensorImpl(TensorImpl&&) = default; + TensorImpl& operator=(TensorImpl&&) = default; + + virtual ~TensorImpl() noexcept {} + + /* + * Since we removed template from tensor, we now store a static + * context pointer in tensor, which indicates the type of the tensor. + */ + at::BaseStaticContext* GetStaticContext() const { + auto device_type = GetDeviceType(); + return get_static_context(device_type); + } + + /* @brief + * Create a context that has the same device_type + * as the tensor. + * Note that this doesn't support passing in argument + * TODO(jerryzh): move this to a global registry + * that can create context for us + */ + std::unique_ptr CreateContext() const { + return GetStaticContext()->CreateContext(); + } + + at::DeviceType GetDeviceType() const { + return storage_.device_type(); + } + + /** + * @brief Copies the data from a source tensor, with a contex provided to + * carry out the underlying memcpy operation. 
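For orientation, the shape helpers added above (size_from_dim_, size_to_dim_, size_between_dim_, canonical_axis_index_) are plain index arithmetic over the dims vector. The following standalone sketch (not part of the patch; int64_t stands in for TIndex) reproduces their behavior:

```
#include <cassert>
#include <cstdint>
#include <vector>

// Mirrors size_from_dim_: product of dims[k], dims[k+1], ...
int64_t size_from_dim(int k, const std::vector<int64_t>& dims) {
  int64_t r = 1;
  for (size_t i = static_cast<size_t>(k); i < dims.size(); ++i) {
    r *= dims[i];
  }
  return r;
}

// Mirrors size_to_dim_: product of dims[0] .. dims[k-1]
int64_t size_to_dim(int k, const std::vector<int64_t>& dims) {
  int64_t r = 1;
  for (int i = 0; i < k; ++i) {
    r *= dims[i];
  }
  return r;
}

// Mirrors canonical_axis_index_: wrap a negative axis, so -1 is the last dim
int canonical_axis_index(int axis_index, int ndims) {
  return axis_index < 0 ? axis_index + ndims : axis_index;
}

int main() {
  std::vector<int64_t> dims{2, 3, 4, 5};
  assert(size_from_dim(2, dims) == 20);      // 4 * 5
  assert(size_to_dim(2, dims) == 6);         // 2 * 3
  assert(size_from_dim(0, dims) == 120);     // the full numel
  assert(canonical_axis_index(-1, 4) == 3);  // last axis
  return 0;
}
```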
+ */ + void CopyFrom(const TensorImpl& src, at::BaseContext* context = nullptr) { + if ((void*)&src == (void*)this) { + return; + } + if (data_type_ != src.meta()) { + CAFFE_ENFORCE_WITH_CALLER( + src.is_contiguous(), + "Right now only copy of contiguous source Tensor is supported."); + storage_ = at::Storage(GetDeviceType(), src.meta()); + data_type_ = src.meta(); + } + if (src.size() == -1) { + dims_.clear(); + numel_ = -1; + strides_.clear(); + is_contiguous_ = true; + storage_.reset(); + data_type_ = TypeMeta(); + return; + } + Resize(src.dims()); + if (size() > 0) { + if (data_type_.copy()) { + CAFFE_ENFORCE( + GetDeviceType() == CPU, + "In CopyFrom source and dest tensors must both be CPU for meta copy"); + CAFFE_ENFORCE( + src.GetDeviceType() == CPU, + "In CopyFrom source and dest tensors must both be CPU for meta copy"); + data_type_.copy()(src.raw_data(), raw_mutable_data(), size()); + } else { + // We'll need to use a non-CPU context to perform the copy if + // one of the context is not CPU since only non-CPU context + // knows how to copy between CPU and that context + if (src.GetDeviceType() != CPU || GetDeviceType() == CPU) { + if (!context) { + src.CreateContext()->CopyBytesToDevice( + nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); + } else { + CAFFE_ENFORCE( + context->device_type() == src.GetDeviceType(), + "Type for provided context does not match the type of source"); + context->CopyBytesToDevice( + nbytes(), src.raw_data(), raw_mutable_data(), GetDeviceType()); + } + } else { + // In case source context is CPU, and target context is non-CPU + // We'll have to create a Context from target and perform the + // copy using that context + CreateContext()->CopyBytesFromCPU( + nbytes(), src.raw_data(), raw_mutable_data()); + } + } + } + } + + /** + * @brief Extend the outer-most dimension of this tensor + * to dimension of `num`. + */ + void ExtendTo(TIndex num, float growthPct, at::BaseContext* context) { + CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); + CAFFE_ENFORCE_GE_WITH_CALLER(growthPct, 0); + CAFFE_ENFORCE(context != nullptr, "Context must be provided."); + Extend(num - dims_[0], growthPct, context); + } + + /** + * @brief Extends the outer-most dimension of this tensor by num elements, + * preserving the existing data. + * + * The underlying data may be reallocated in order to accommodate the new + * elements, in which case this tensors' capacity is grown at a factor of + * growthPct. This ensures that Extend runs on an amortized O(1) time + * complexity. 
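To make the amortized-growth claim concrete, this is the capacity rule the implementation below applies, written as a standalone sketch (illustrative only, not part of the patch):

```
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// The reserved outer dimension becomes at least
// ceil(old_outer * (growthPct + 100) / 100), and never less than what the
// caller actually requested.
int64_t grown_outer_dim(int64_t old_outer, int64_t num, float growthPct) {
  const int64_t requested = old_outer + num;
  const int64_t grown = static_cast<int64_t>(
      std::ceil(old_outer * (growthPct + 100) / 100));
  return std::max(requested, grown);
}

int main() {
  // With growthPct = 40, appending one row to a 100-row tensor reserves room
  // for 140 rows, so the next 39 single-row Extend() calls reuse the buffer.
  std::cout << grown_outer_dim(100, 1, 40.f) << "\n";   // 140
  std::cout << grown_outer_dim(100, 80, 40.f) << "\n";  // 180, request dominates
  return 0;
}
```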
+ */ + void Extend(TIndex num, float growthPct, at::BaseContext* context) { + CAFFE_ENFORCE_GE_WITH_CALLER(dims_.size(), 1); + CAFFE_ENFORCE_GE_WITH_CALLER( + num, 0, "`num` must be non-negative for Extend"); + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now Extend is only supported for contiguous Tensor."); + auto newDims = dims_; + newDims[0] += num; + if (!storage_.data()) { + Resize(newDims); + return; + } + auto newNumel = std::accumulate( + newDims.begin(), + newDims.end(), + static_cast(1), + std::multiplies()); + if (newNumel * storage_.itemsize() <= storage_.capacity()) { + dims_ = newDims; + numel_ = newNumel; + return; + } + auto newCapacity = dims_; + newCapacity[0] = std::max( + newDims[0], std::ceil(dims_[0] * (growthPct + 100) / 100)); + auto oldData = std::move(storage_.data_ptr()); + auto oldSize = numel_; + auto oldDims = dims_; + Resize(newCapacity); + auto* newData = raw_mutable_data(data_type_); + CAFFE_ENFORCE( + context != nullptr, "Context must be provided to Extend the tensor"); + context->CopyItemsSameDevice( + data_type_, oldSize, oldData.get(), newData); + reserved_ = true; + dims_ = newDims; + numel_ = newNumel; + } + + /** + * @brief Shrinks the outer-most dimension to given size, keeping the data. + * + * This method guarantees that no re-allocations are carried out, which means + * that the extra capacity after the end of the shurnk tensor is maintained. + */ + void ShrinkTo(TIndex outer_dim) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now ShrinkTo is only supported on contiguous Tensor."); + CAFFE_ENFORCE_WITH_CALLER(dims_.size() >= 1, "Tensor must be at least 1D"); + CAFFE_ENFORCE_WITH_CALLER( + outer_dim <= dims_[0], + "New outer dimension must be smaller than current."); + CAFFE_ENFORCE( + storage_.unique(), + "Can't call ShrinkTo on shared storage, please call Resize instead."); + dims_[0] = outer_dim; + numel_ = std::accumulate( + dims_.begin(), + dims_.end(), + static_cast(1), + std::multiplies()); + } + + /** + * @brief Reserve space for the underlying tensor. + * + * This must be called after Resize(), since we only specify the first + * dimension This does not copy over the old data to the newly allocated space + */ + template + void ReserveSpace(const T& outer_dim) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now ReserveSpace is only supported for contiguous Tensor."); + CAFFE_ENFORCE( + numel_ != -1, "size should be initialized before calling ReserveSpace"); + CAFFE_ENFORCE( + storage_.unique(), "Can't call ReserveSpace on shared storage."); + auto newCapacity = dims_; + newCapacity[0] = outer_dim; + auto newNumel = std::accumulate( + newCapacity.begin(), + newCapacity.end(), + static_cast(1), + std::multiplies()); + if (newNumel * storage_.itemsize() <= storage_.capacity()) { + return; + } + // Old data is discarded + storage_.data_ptr().clear(); + auto oldSize = numel_; + auto oldDims = dims_; + Resize(newCapacity); + // Allocate new memory but don't copy over the data + raw_mutable_data(data_type_); + dims_ = oldDims; + numel_ = oldSize; + reserved_ = true; + } + + /** + * @brief Resizes a tensor. + * + * Resize takes in a vector of ints specifying the dimensions of the tensor. + * You can pass in an empty vector to specify that it is a scalar (i.e. + * containing one single item). 
+ * + * The underlying storage may be deleted after calling Resize: if the new + * shape leads to a different number of items in the tensor, the old memory + * is deleted and new memory will be allocated next time you call + * mutable_data(). However, if the shape is different but the total number of + * items is the same, the underlying storage is kept. + */ + template + void Resize(Ts... dim_source) { + bool is_init = numel_ == -1; + bool size_changed = SetDims(dim_source...); + if (size_changed) { + // If needed, we will free the data. the next mutable_data() call + // will create the data storage. + bool reset_tensor = false; + if (reserved_) { + // If tensor is reserved then don't claim its memeory unless capacity() + // is smaller than new size + reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize(); + } else { + reset_tensor = storage_.capacity() < (storage_offset_ + numel_) * storage_.itemsize() || + !FLAGS_caffe2_keep_on_shrink || + storage_.capacity() - (storage_offset_ + numel_) * storage_.itemsize() > + FLAGS_caffe2_max_keep_on_shrink_memory; + } + + if (reset_tensor && !is_init) { + FreeMemory(); + } + } + } + + /** + * Resize the tensor like the source tensor. Note that this is just a + * sugar wrapper that essentially calls Resize(src_tensor.dims()). + */ + inline void ResizeLike(const TensorImpl& src_tensor) { + CAFFE_ENFORCE_WITH_CALLER( + src_tensor.is_contiguous(), + "Right now ResizeLike is only supported for contiguous Tensor."); + // Note: need casting for different context types. + if (static_cast(this) != static_cast(&src_tensor)) { + Resize(src_tensor.dims()); + } + } + + /** + * Resizes the tensor without touching underlying storage. + * This requires the total size of the tensor to remains constant. + */ + inline void Reshape(const std::vector& dims) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now Reshape is only supported for contiguous Tensor."); + TIndex new_size = 1; + for (auto d : dims) { + CAFFE_ENFORCE_GE_WITH_CALLER(d, 0); + new_size *= d; + } + CAFFE_ENFORCE_WITH_CALLER( + new_size == numel_, + "New size and old size are not equal. You cannot use Reshape, " + "but should use Resize." + // TODO(jiayq): remove the following warning after pending diffs + // stabilize. + " The old caffe2 mixes Reshape and Resize but this behavior has " + "been changed. If you find this error, most likely you will need " + "to change corresponding code from Reshape to Resize."); + dims_ = dims; + } + + inline void Reshape(const std::vector& dims) { + Reshape(ToVectorTIndex(dims)); + } + + /** + * Release whatever memory the tensor was holding but keep size and type + * information. Subsequent call to mutable_data will trigger new memory + * allocation. + */ + inline void FreeMemory() { + // We'll detach from the old Storage and create a new one + storage_ = at::Storage(storage_.device_type(), data_type_); + storage_offset_ = 0; + } + + /** + * A utility function to print the debug string for the tensor. Note that this + * is very slow since it involves quite some string operations, so do not use + * it in your performance-critical code. + */ + std::string DebugString() const { + std::stringstream ss; + ss << "A Tensor of item size " << storage_.itemsize() << " and type " + << data_type_.name() << " and dimension ("; + for (int d : dims_) { + ss << d << ","; + } + ss << ")."; + return ss.str(); + } + + /** + * @brief Shares the data with another tensor. 
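A concrete, hypothetical illustration of the sharing described here, using the Tensor-level ShareData wrapper shown earlier: a rank-2 tensor and a 1-D tensor with the same number of elements can alias one buffer.

```
#include "caffe2/core/tensor.h"

// Sketch only; assumes the caffe2 Tensor API from this patch.
void flattened_view_example() {
  using namespace caffe2;
  Tensor t(std::vector<TIndex>{2, 3}, CPU);
  t.mutable_data<float>();      // materialize the 6-element buffer

  Tensor flat(std::vector<TIndex>{6}, CPU);
  flat.ShareData(t);            // same storage, viewed as 1-D

  t.mutable_data<float>()[0] = 1.f;
  // flat.data<float>()[0] now reads 1.f as well.
}
```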
+ * + * To share data between two tensors, the sizes of the two tensors must be + * equal already. The reason we do not implicitly do a Resize to make the two + * tensors have the same shape is that we want to allow tensors of different + * shapes but the same number of items to still be able to share data. This + * allows one to e.g. have a n-dimensional Tensor and a flattened version + * sharing the same underlying storage. + * + * The source tensor should already have its data allocated. + */ + void ShareData(const TensorImpl& src) { + // Right now, we are assuming the device_type are the same, since it is + // inherently the same in the non-templatized code. We should probably add + // an ENFORCE here which might affect perf a little bit. + CAFFE_ENFORCE_EQ_WITH_CALLER( + src.numel_, + numel_, + "Size mismatch - did you call reshape before sharing the data?"); + // It is possible that the source tensor hasn't called mutable_data() yet, + // in which case ShareData() doesn't make much sense since we don't really + // know what to share yet. + CAFFE_ENFORCE_WITH_CALLER( + src.storage_.data() || src.numel_ == 0, + "Source tensor has no content and has size > 0"); + // Finally, do sharing. + /* Since we create new Storage whenever we need to change data_type/capacity + * this still keeps the original semantics + */ + storage_ = src.storage(); + data_type_ = src.dtype(); + storage_offset_ = src.storage_offset(); + } + + /** + * @brief Shares the data with an externally managed pointer. + * + * This is similar to ShareData() but the source is a pointer with an advanced + * deleter option. In default, no deletion takes place, and one needs to make + * sure that the external memory is deallocated only after the tensor finishes + * using it. If a Deleter object is passed in, when this tensor is reallocated + * or freed, the deleter function is going to be called. 
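A hypothetical caller-side sketch of the behavior described above, using the Tensor-level wrapper shown earlier (buffer name and sizes are illustrative):

```
#include "caffe2/core/tensor.h"

// Sketch only; assumes the caffe2 Tensor API from this patch.
void wrap_external_buffer(float* buffer) {     // buffer stays owned by the caller
  using namespace caffe2;
  Tensor t(std::vector<TIndex>{2, 3}, CPU);    // shape must be set first
  // No deleter is passed, so the tensor never frees `buffer`; the caller has
  // to keep it alive for as long as the tensor is in use.
  t.ShareExternalPointer(buffer, 6 * sizeof(float));
}
```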
+ */ + template + void + ShareExternalPointer(T* src, size_t capacity = 0, MemoryDeleter d = nullptr) { + ShareExternalPointer((void*)src, TypeMeta::Make(), capacity, d); + } + + template + void ShareExternalPointer(at::DataPtr&& data_ptr, size_t capacity = 0) { + ShareExternalPointer(std::move(data_ptr), TypeMeta::Make(), capacity); + } + + void ShareExternalPointer( + void* src, + const TypeMeta& data_type, + size_t capacity = 0, + MemoryDeleter d = nullptr) { + CAFFE_ENFORCE_WITH_CALLER( + is_contiguous_, + "Right now ShareExternalPointer is only supported for contiguos Tensor."); + CAFFE_ENFORCE_WITH_CALLER( + data_type.id() != TypeIdentifier::uninitialized(), + "To share with a raw external pointer you need to pass in an " + "initialized data_type(TypeMeta)."); + ShareExternalPointer( + at::DataPtr(src, src, d, GetDeviceType()), data_type, capacity); + } + + void ShareExternalPointer( + at::DataPtr&& data_ptr, + const TypeMeta& data_type, + size_t capacity) { + CAFFE_ENFORCE_WITH_CALLER( + data_type.id() != TypeIdentifier::uninitialized(), + "To share with a raw external pointer you need to pass in an " + "initialized data_type(TypeMeta)."); + if (!capacity) { + capacity = numel_ * data_type.itemsize(); + } + if (storage_.unique()) { + CAFFE_ENFORCE_WITH_CALLER( + numel_ >= 0, + "To share data with a raw pointer, you need to set shape first."); + storage_.UniqueStorageShareExternalPointer( + std::move(data_ptr), data_type, capacity); + data_type_ = data_type; + storage_offset_ = 0; + } else { + int64_t numel = capacity / data_type.itemsize(); + // Create a new Storage + storage_ = at::Storage(data_type, numel, std::move(data_ptr), nullptr, true); + data_type_ = data_type; + storage_offset_ = 0; + } + } + + /** + * Returns a const raw void* pointer of the underlying storage. mutable_data() + * or raw_mutable_data() must have been called prior to this function call. + */ + inline const void* raw_data() const { + CAFFE_ENFORCE_WITH_CALLER(storage_.data() || numel_ == 0); + return static_cast(static_cast(storage_.data()) + storage_offset_ * storage_.itemsize()); + } + + /** + * Returns a typed pointer of the underlying storage. mutable_data() or + * raw_mutable_data() must have been called prior to this function call, and + * the data type must be of the correct type. If you want to get a void* + * pointer instead, use raw_data(). + */ + template + inline const T* data() const { + CAFFE_ENFORCE_WITH_CALLER( + storage_.data() || numel_ == 0, + "The tensor is of non-zero shape, but its data is not allocated yet. " + "Caffe2 uses a lazy allocation, so you will need to call " + "mutable_data() or raw_mutable_data() to actually allocate memory."); + CAFFE_ENFORCE_WITH_CALLER( + IsType(), + "Tensor type mismatch, caller expects elements to be ", + TypeMeta::TypeName(), + ", while tensor contains ", + data_type_.name(), + ". "); + return static_cast(storage_.data()) + storage_offset_; + } + + /** + * Returns a mutable raw pointer of the underlying storage. Since we will need + * to know the type of the data for allocation, a TypeMeta object is passed in + * to specify the necessary information. This is conceptually equivalent of + * calling mutable_data() where the TypeMeta parameter meta is derived from + * the type T. This function differs from mutable_data() in the sense that + * the type T can be specified during runtime via the TypeMeta object. + * + * If the existing data does not match the desired type, it will be deleted + * and a new storage will be created. 
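In caller terms, the lazy-allocation contract referenced throughout this header looks like the following hypothetical sketch (assumes the Tensor wrapper from this patch):

```
#include "caffe2/core/tensor.h"

// Sketch only; illustrative.
void lazy_allocation_example() {
  using namespace caffe2;
  Tensor t(std::vector<TIndex>{4, 5}, CPU);  // shape recorded, no memory yet
  // Calling t.data<float>() here would trip the "data is not allocated yet"
  // enforce above.
  float* p = t.mutable_data<float>();        // first call allocates 20 floats
  p[0] = 3.14f;
  const float* cp = t.data<float>();         // fine now; same buffer
  (void)cp;
}
```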
+ */ + inline void* raw_mutable_data(const TypeMeta& meta) { + // For 0-size tensors it's fine to return any pointer (including nullptr) + if (data_type_ == meta && (storage_.data() || numel_ == 0)) { + return static_cast(static_cast(storage_.data()) + storage_offset_ * meta.itemsize()); + } else { + CAFFE_ENFORCE_WITH_CALLER( + numel_ >= 0, + "Tensor is not initialized. You probably need to call Resize() " + "before calling mutable_data()"); + bool had_special_dtor = data_type_.dtor() != nullptr; + storage_offset_ = 0; + if (storage_.unique()) { + storage_.set_dtype(meta); + } else { + if (data_type_ != meta) { + storage_ = at::Storage(storage_.device_type(), meta); + } + } + data_type_ = meta; + + // We can reuse the existing buffer if the current data does not have + // a special destructor and the new data doesn't have a special + // constructor. + if (numel_ == 0 || + (meta.ctor() == nullptr && !had_special_dtor && + storage_.numel() >= numel_)) { + AT_ASSERT(storage_offset_ == 0); // because we just reallocated + return storage_.data(); + } + const at::Allocator* allocator = storage_.allocator(); + // TODO: Get rid of StaticContext + CAFFE_ENFORCE( + allocator == nullptr, + "Allocator is not used within Caffe2 functions, please use StaticContext instead."); + if (meta.ctor()) { + // For types that need placement new, we will call it, as well as + // making sure that when the data is freed, it calls the right + // destruction procedure. + auto size = numel_; + auto dtor = data_type_.dtor(); + void* ptr; + at::DeleterFnPtr deleter; + auto ptr_and_deleter = GetStaticContext()->New( + numel_ * storage_.itemsize()); // Removing this can get rid of + // InefficientStdFunctionContext + ptr = ptr_and_deleter.first; + deleter = ptr_and_deleter.second; + storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( + ptr, + [size, dtor, deleter](void* local_ptr) -> void { + dtor(local_ptr, size); + deleter(local_ptr); + }, + at::Device(storage_.device_type()))); + data_type_.ctor()(storage_.data(), numel_); + } else { + // For fundamental type, new and delete is easier. + auto ptr_and_deleter = + GetStaticContext()->New(numel_ * storage_.itemsize()); + storage_.set_data_ptr(at::InefficientStdFunctionContext::makeDataPtr( + ptr_and_deleter.first, + ptr_and_deleter.second, + at::Device(storage_.device_type()))); + } + storage_.set_numel(numel_); + AT_ASSERT(storage_offset_ == 0); // because we just reallocated + return storage_.data(); + } + } + + /** + * Returns a mutable raw pointer of the underlying storage. This can only be + * used when you know for sure that the underlying storage of the tensor is + * already created via an earlier raw_mutable_data(meta) call or a + * mutable_data() call. + * + * If the existing data does not match the desired type, it will be deleted + * and a new storage will be created. + */ + inline void* raw_mutable_data() { + CAFFE_ENFORCE_WITH_CALLER( + data_type_.id() != TypeIdentifier::uninitialized(), + "Calling raw_mutable_data() without meta, but the current meta is " + "of unknown type."); + return raw_mutable_data(data_type_); + } + + /** + * Returns a typed pointer of the underlying storage. + * + * For fundamental types, we reuse possible existing storage if there + * is sufficient capacity. 
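Concretely, the reuse condition in raw_mutable_data() above plays out as in this hypothetical sketch:

```
#include <string>
#include "caffe2/core/tensor.h"

// Sketch only; assumes the caffe2 Tensor API from this patch.
void dtype_switch_example() {
  using namespace caffe2;
  Tensor t(std::vector<TIndex>{8}, CPU);
  t.mutable_data<float>();        // allocates 8 * sizeof(float)
  t.mutable_data<int>();          // trivial type, same element count: reused
  t.mutable_data<std::string>();  // needs placement-new: a fresh buffer is
                                  // allocated and the strings are constructed
}
```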
+ */ + template + inline T* mutable_data() { + if ((numel_ == 0 || storage_.data()) && IsType()) { + return static_cast(storage_.data()) + storage_offset_; + } + // Check it here statically - otherwise TypeMeta would throw the runtime + // error in attempt to invoke TypeMeta::ctor() + static_assert( + std::is_default_constructible::value, + "Tensor can't hold non-default-constructible types"); + return static_cast(raw_mutable_data(TypeMeta::Make())); + } + + /** + * Returns the number of dimensions of the data. + */ + inline int ndim() const { + return dims_.size(); + } + /** + * Returns the size (i.e. the number of items) of the tensor. + */ + inline TIndex size() const { + return numel_; + } + /** + * Return the number of bytes each item takes in the tensor. + */ + inline size_t itemsize() const { + return storage_.itemsize(); + } + /** + * Returns the total number of bytes of the storage. + * + * This is equivalent to calling size() * itemsize(). + */ + inline size_t nbytes() const { + return numel_ * itemsize(); + ; + } + + // NB: This capacity may also include available space + // in the storage BEFORE the tensor data, if storage_offset != 0 + inline size_t capacity_nbytes() const { + return storage_.capacity(); + } + /** + * Returns the dimensions of the tensor as a vector. + */ + inline const std::vector& dims() const { + return dims_; + } + + inline TIndex size_from_dim(int k) const { + return size_from_dim_(k, dims_); + } + + inline TIndex size_to_dim(int k) const { + return size_to_dim_(k, dims_); + } + + inline TIndex size_between_dim(int k, int l) const { + return size_between_dim_(k, l, dims_); + } + + /** + * Returns the 'canonical' version of a (usually) user-specified axis, + * allowing for negative indexing (e.g., -1 for the last axis). + * + * @param axis_index the axis index. + * If 0 <= index < ndim(), return index. + * If -ndim <= index <= -1, return (ndim() - (-index)), + * e.g., the last axis index (ndim() - 1) if index == -1, + * the second to last if index == -2, etc. + * Dies on out of range index. + */ + inline int canonical_axis_index(int axis_index) const { + return canonical_axis_index_(axis_index, ndim()); + } + + inline int64_t stride(int64_t dim) const { +#ifndef NDEBUG + // TODO: dim wrapping? + CAFFE_ENFORCE_LT_WITH_CALLER(dim, strides_.size(), "Exceeding ndim limit"); + CAFFE_ENFORCE_GE_WITH_CALLER( + dim, 0, "Cannot have negative dimension index"); +#endif + return strides_[dim]; + } + + // TODO: Change to ArrayRef later + inline at::DimVector strides() { + return strides_; + } + + inline bool is_contiguous() const { + return is_contiguous_; + } + + /** + * Checks if the tensor content is of the given data type. + */ + template + inline bool IsType() const { + return storage_.IsType(); + } + /** + * Returns the TypeMeta object associated with the current data type. + */ + inline const TypeMeta& meta() const { + return data_type_; + } + + inline const TypeMeta& dtype() const { + return data_type_; + } + + /** + * Returns the i-th dimension of the tensor in int. + * + * This function returns an int value instead of TIndex, which depending on + * the typedef could be int64. If you want int64 dim values, make sure you + * call dim() instead. 
+ */ + inline int dim32(const int i) const { +#ifndef NDEBUG + CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); + CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); +#endif + CAFFE_ENFORCE_LT_WITH_CALLER(dims_[i], std::numeric_limits::max()); + return static_cast(dims_[i]); + } + + /** + * Returns the i-th dimension of the tensor. Note that the passed in index + * must be between 0 (inclusive) and the number of dimensions, otherwise + * this function will produce a fatal message. + */ + inline TIndex dim(const int i) const { +#ifndef NDEBUG + CAFFE_ENFORCE_LT_WITH_CALLER(i, dims_.size(), "Exceeding ndim limit"); + CAFFE_ENFORCE_GE_WITH_CALLER(i, 0, "Cannot have negative dimension index"); +#endif + return dims_[i]; + } + + void ExtractDeviceOption(DeviceOption* device) const { + auto* context = GetStaticContext(); + CHECK(context); + context->ExtractDeviceOption(device, raw_data()); + } + + const at::Storage& storage() { + return storage_; + } + + const at::Storage& storage() const { + return storage_; + } + + int64_t storage_offset() const { + return storage_offset_; + } + + protected: + // TODO: change to DimVector + std::vector dims_; // sizes_ + at::DimVector strides_; + TIndex numel_ = -1; // numel_ + bool is_contiguous_ = true; + // we decide to keep reserved_ and it will + // live in Tensor after the split + // The logic is that if Extend() or ReserveSpace() were ever called, + // then subsequent Resize()s will not free up Storage. + bool reserved_ = false; + at::Storage storage_; + int64_t storage_offset_ = 0; + TypeMeta data_type_; + + private: + template < + typename T, + typename = typename std::enable_if::value>::type> + bool SetDims(const std::vector& src) { + auto old_numel = numel_; + dims_.resize(src.size()); + TIndex new_numel = 1; + for (size_t i = 0; i < src.size(); ++i) { + new_numel *= src[i]; + dims_[i] = src[i]; + } + update_strides(); + numel_ = new_numel; + return numel_ != old_numel; + } + + bool SetDims() { + auto old_numel = numel_; + dims_.resize(0); + update_strides(); + numel_ = 1; + return numel_ != old_numel; + } + + // TODO(jiayq): maybe rewrite the following functions with initializer list. + // NVCC does not play well with initializer lists last time, but worth + // another shot. 
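Each SetDims overload below also refreshes strides_ through update_strides(), defined after them. The resulting row-major layout, exposed by the new stride()/strides()/is_contiguous() accessors above, can be reproduced standalone (illustrative only):

```
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Mirrors update_strides(): the innermost stride is 1 and each outer stride is
// the next stride times the next dimension, clamped to at least 1.
std::vector<int64_t> row_major_strides(const std::vector<int64_t>& dims) {
  std::vector<int64_t> strides(dims.size());
  if (!dims.empty()) {
    strides.back() = 1;
    for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
      strides[i] = strides[i + 1] * std::max<int64_t>(dims[i + 1], 1);
    }
  }
  return strides;
}

int main() {
  assert((row_major_strides({2, 3, 4}) == std::vector<int64_t>{12, 4, 1}));
  // A zero-sized dimension is clamped to 1 so no stride collapses to zero.
  assert((row_major_strides({2, 0, 4}) == std::vector<int64_t>{4, 4, 1}));
  return 0;
}
```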
+ bool SetDims(const TIndex d0) { + auto old_numel = numel_; + dims_.resize(1); + dims_[0] = d0; + update_strides(); + numel_ = d0; + return numel_ != old_numel; + } + + bool SetDims(const TIndex d0, const TIndex d1) { + auto old_numel = numel_; + dims_.resize(2); + dims_[0] = d0; + dims_[1] = d1; + update_strides(); + numel_ = d0 * d1; + return numel_ != old_numel; + } + + bool SetDims(const TIndex d0, const TIndex d1, const TIndex d2) { + auto old_numel = numel_; + dims_.resize(3); + dims_[0] = d0; + dims_[1] = d1; + dims_[2] = d2; + update_strides(); + numel_ = d0 * d1 * d2; + return numel_ != old_numel; + } + + bool + SetDims(const TIndex d0, const TIndex d1, const TIndex d2, const TIndex d3) { + auto old_numel = numel_; + dims_.resize(4); + dims_[0] = d0; + dims_[1] = d1; + dims_[2] = d2; + dims_[3] = d3; + update_strides(); + numel_ = d0 * d1 * d2 * d3; + return numel_ != old_numel; + } + + inline void update_strides() { + strides_.resize(dims_.size()); + if (ndim() > 0) { + int last_idx = ndim() - 1; + strides_[last_idx] = 1; + for (auto i = last_idx - 1; i >= 0; --i) { + strides_[i] = strides_[i + 1] * std::max(dims_[i + 1], 1); + } + } + is_contiguous_ = true; + } +}; + +} diff --git a/caffe2/ideep/operators/concat_split_op.cc b/caffe2/ideep/operators/concat_split_op.cc index 25d4e16d2f9e7a..311c6446184a87 100644 --- a/caffe2/ideep/operators/concat_split_op.cc +++ b/caffe2/ideep/operators/concat_split_op.cc @@ -33,7 +33,7 @@ class IDEEPConcatOp final : public IDEEPOperator { if (OperatorBase::InputBlob(i).template IsType()) { inputs.emplace_back(Input(i)); } else { - CAFFE_ENFORCE(OperatorBase::InputBlob(i).IsType(CPU), + CAFFE_ENFORCE(OperatorBase::InputBlob(i).IsTensorType(CPU), "Expect cpu tensor if not itensor"); auto& tensor_cpu = OperatorBase::Input(i, CPU); CAFFE_ENFORCE(tensor_cpu.dims().size() == 0 || diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h index 9ae2323442120a..08e6de2ae3f0dc 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.h +++ b/caffe2/ideep/operators/operator_fallback_ideep.h @@ -121,7 +121,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->template IsType(CPU), + local_output_blobs_[i]->IsTensorType(CPU), "IDEEP fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); diff --git a/caffe2/ideep/operators/utility_ops.cc b/caffe2/ideep/operators/utility_ops.cc index 63bd0da7cb5cb6..626568a989b939 100644 --- a/caffe2/ideep/operators/utility_ops.cc +++ b/caffe2/ideep/operators/utility_ops.cc @@ -31,7 +31,7 @@ class CopyIDEEPToCPUOp final : public IDEEPOperator { USE_IDEEP_DEF_ALIASES(); bool RunOnDevice() override { const auto& input_blob = OperatorBase::InputBlob(0); - if (input_blob.template IsType(CPU)) { + if (input_blob.IsTensorType(CPU)) { VLOG(2) << "Directing sharing of TensorCPU"; const auto& X = OperatorBase::Input(0, CPU); auto* Y = OperatorBase::Output(0, CPU); diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h index c8657728c57e76..f50a4f34c66789 100644 --- a/caffe2/ideep/utils/ideep_context.h +++ b/caffe2/ideep/utils/ideep_context.h @@ -119,7 +119,7 @@ class IDEEPContext final : public BaseContext { return true; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return IDEEP; } diff --git a/caffe2/mkl/operators/operator_fallback_mkl.h 
b/caffe2/mkl/operators/operator_fallback_mkl.h index ce4c85d2c231e0..6d9713b74612d8 100644 --- a/caffe2/mkl/operators/operator_fallback_mkl.h +++ b/caffe2/mkl/operators/operator_fallback_mkl.h @@ -93,7 +93,7 @@ class MKLFallbackOp final : public Operator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->template IsType(CPU), + local_output_blobs_[i]->IsTensorType(CPU), "MKL fallback op currently does not support non-TensorCPU " "output type who needs copying."); const auto& src = local_output_blobs_[i]->template Get(); diff --git a/caffe2/mkl/operators/packed_fc_op.cc b/caffe2/mkl/operators/packed_fc_op.cc index 1f3231dc521f5d..0ed93cf061070c 100644 --- a/caffe2/mkl/operators/packed_fc_op.cc +++ b/caffe2/mkl/operators/packed_fc_op.cc @@ -49,7 +49,7 @@ class PackedFCOp final : public Operator { // Check out what is the passed in format. const MKLPackedMatrix* packed_matrix = nullptr; - if (OperatorBase::InputIsType(1, CPU)) { + if (OperatorBase::InputIsTensorType(1, CPU)) { const auto& W = Input(1); CAFFE_ENFORCE_EQ(W.ndim(), 2); CAFFE_ENFORCE_EQ(W.dim32(0), N); diff --git a/caffe2/mkl/utils/mkl_context.h b/caffe2/mkl/utils/mkl_context.h index 636ebf2217eac8..0a7b5808a446be 100644 --- a/caffe2/mkl/utils/mkl_context.h +++ b/caffe2/mkl/utils/mkl_context.h @@ -127,7 +127,7 @@ class MKLContext : public BaseContext { return true; } - DeviceType GetDevicetype() const override { + DeviceType device_type() const override { return MKLDNN; } diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index 755e1b5a57b8a9..2238d7af08dda6 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -489,7 +489,7 @@ bool RunOnDevice() override { "noise_size", 491 /* prime to avoid artifacts */); // Treaded as half4 in the kernel, so need half4 here. noiseSize = divRoundUp(noiseSize, 4) * 4; - if (!noiseBlob->IsType(CPU) || + if (!noiseBlob->IsTensorType(CPU) || noiseBlob->Get().size() != noiseSize) { VLOG(2) << "Initializing stylizer with noise: " << noiseSize; caffe2::Timer rt; diff --git a/caffe2/mpi/mpi_ops.h b/caffe2/mpi/mpi_ops.h index b84de851a9e948..8657c107ed0f33 100644 --- a/caffe2/mpi/mpi_ops.h +++ b/caffe2/mpi/mpi_ops.h @@ -36,7 +36,7 @@ class MPIBroadcastOp final : public Operator { bool RunOnDevice() override { MPI_Comm comm = OperatorBase::Input(0).comm(); CAFFE_ENFORCE( - OperatorBase::OutputIsType(0, Context::GetDeviceType()), + OperatorBase::OutputIsTensorType(0, Context::GetDeviceType()), "Output is of wrong type."); auto* output = Output(0); // Make sure that output is already allocated. 
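The ideep, mkl, mpscnn, and mpi hunks above, and the observer and operator hunks below, all apply the same mechanical rename: device-qualified IsType<Tensor>(device) checks become the dedicated IsTensorType helpers. The call-site pattern, shown on a hypothetical operator (name and body invented for illustration), is:

```
#include "caffe2/core/operator.h"

namespace caffe2 {

// Hypothetical operator, for illustration only.
template <class Context>
class ExampleFallbackOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(ExampleFallbackOp);

  bool RunOnDevice() override {
    // Previously written as the device-qualified IsType check on input 0.
    if (this->InputIsTensorType(0, CPU)) {
      // Input 0 is a TensorCPU, so Input(0, CPU) is safe here.
    }
    return true;
  }
};

} // namespace caffe2
```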
diff --git a/caffe2/observers/profile_observer_gpu.cc b/caffe2/observers/profile_observer_gpu.cc index 5d40916be09346..bf4e20b7904711 100644 --- a/caffe2/observers/profile_observer_gpu.cc +++ b/caffe2/observers/profile_observer_gpu.cc @@ -26,12 +26,12 @@ void ProfileOperatorObserver::Dump() const { LOG(INFO) << "--------- Starting operator " << subject_->debug_def().type() << " op#" << getId() << " ---------"; for (int i = 0; i < subject_->InputSize(); ++i) { - if (subject_->InputIsType(i, CPU)) { + if (subject_->InputIsTensorType(i, CPU)) { const auto& tensor = subject_->Input(i, CPU); const auto& name = subject_->debug_def().input(i); TensorPrinter printer(name); LOG(INFO) << "Input " << i << ": " << printer.MetaStr(tensor); - } else if (subject_->InputIsType(i, CUDA)) { + } else if (subject_->InputIsTensorType(i, CUDA)) { const auto& tensor = subject_->Input(i, CUDA); const auto& name = subject_->debug_def().input(i); TensorPrinter printer(name); @@ -46,12 +46,12 @@ void ProfileOperatorObserver::Dump() const { } for (int o = 0; o < subject_->OutputSize(); ++o) { - if (subject_->OutputIsType(o, CPU)) { + if (subject_->OutputIsTensorType(o, CPU)) { auto* tensor = subject_->Output(o, CPU); const auto& name = subject_->debug_def().output(o); TensorPrinter printer(name); LOG(INFO) << "Output " << o << ": " << printer.MetaStr(*tensor); - } else if (subject_->OutputIsType(o, CUDA)) { + } else if (subject_->OutputIsTensorType(o, CUDA)) { auto* tensor = subject_->Output(o, CUDA); const auto& name = subject_->debug_def().output(o); TensorPrinter printer(name); diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc index 5ccea70926fb6f..2350910febff27 100644 --- a/caffe2/onnx/backend.cc +++ b/caffe2/onnx/backend.cc @@ -301,7 +301,8 @@ Caffe2Backend::get_renamed_operators() const { {"Less", "LT"}, {"Greater", "GT"}, {"Unsqueeze", "ExpandDims"}, - {"Tile", "NumpyTile"}}; + {"Tile", "NumpyTile"}, + {"DynamicSlice", "Slice"}}; return kRenamedOperators; } @@ -356,7 +357,8 @@ Caffe2Backend::get_special_operators() const { {"MatMul", &Caffe2Backend::CreateMatMul}, {"Upsample", &Caffe2Backend::CreateUpsample}, {"Dropout", &Caffe2Backend::CreateDropout}, - {"LRN", &Caffe2Backend::CreateLRN}}; + {"LRN", &Caffe2Backend::CreateLRN}, + {"DynamicSlice", &Caffe2Backend::CreateDynamicSlice}}; return kSpecialOperators; } @@ -899,7 +901,6 @@ Caffe2Ops Caffe2Backend::CreateSlice( auto starts_vals_tensor = dummy_->NewDummyName(); auto starts_tensor = dummy_->NewDummyName(); - auto casted_starts_tensor = dummy_->NewDummyName(); c2_op = ret.ops.Add(); { caffe2::Argument shape_starts; @@ -936,12 +937,9 @@ Caffe2Ops Caffe2Backend::CreateSlice( caffe2::Argument to; to.set_name("to"); to.set_i(static_cast(caffe2::TensorProto::INT32)); - c2_op = ret.ops.Add(); - BuildOperator(c2_op, "Cast", {starts_tensor}, {casted_starts_tensor}, {to}); auto ends_vals_tensor = dummy_->NewDummyName(); auto ends_tensor = dummy_->NewDummyName(); - auto casted_ends_tensor = dummy_->NewDummyName(); c2_op = ret.ops.Add(); { caffe2::Argument shape_ends; @@ -965,17 +963,168 @@ Caffe2Ops Caffe2Backend::CreateSlice( "ScatterAssign", {ends_tensor, axes_tensor, ends_vals_tensor}, {ends_tensor}); - // Slice only accepts ends as int + + // attach the original op at the end + c2_op = ret.ops.Add(); + c2_op->CopyFrom(*op); + c2_op->mutable_input()->Clear(); + c2_op->add_input(data); + c2_op->add_input(starts_tensor); + c2_op->add_input(ends_tensor); + c2_op->mutable_arg()->Clear(); + for (const auto& kv : args) { + 
c2_op->add_arg()->CopyFrom(*kv.second); + } + + return ret; +} + +// Do the following: +// for a given index tensor (i.e. `starts` or `ends`): +// 1) Hilariously subtract 1 from the value if it is negative. This due to +// the behavior of Caffe2's slice operator not matching that of ONNX's slice +// 2) Fully expand the index tensor out to the rank of the data tensor. +// pseudocode: indices_full = zeros(rank); indices_full[axes] = indices.int() +std::string Caffe2Backend::PreprocessSliceIndexTensor(OnnxNode* onnx_node, + Caffe2Ops& ret, + std::string indices_tensor, + std::string axes_tensor, + std::string rank_tensor, + std::string zero_tensor, + std::string one_tensor, + int default_value) { + auto indices_tensor_full = dummy_->NewDummyName(); + + { + caffe2::Argument value; + value.set_name("value"); + value.set_i(default_value); + caffe2::Argument dtype; + dtype.set_name("dtype"); + dtype.set_i(static_cast(caffe2::TensorProto::INT64)); + caffe2::Argument input_as_shape; + input_as_shape.set_name("input_as_shape"); + input_as_shape.set_i(1); + auto c2_op = ret.ops.Add(); + BuildOperator(c2_op, "ConstantFill", {rank_tensor}, {indices_tensor_full}, + {value, dtype, input_as_shape}); + } + + // Subtract 1 from each element of the indices tensor that is negative + auto lt_tensor = dummy_->NewDummyName(); + { + caffe2::Argument broadcast; + broadcast.set_name("broadcast"); + broadcast.set_i(1); + auto c2_op = ret.ops.Add(); + BuildOperator(c2_op, "LT", {indices_tensor, zero_tensor}, {lt_tensor}, {broadcast}); + } + + auto sub_one_tensor = dummy_->NewDummyName(); + { + caffe2::Argument broadcast; + broadcast.set_name("broadcast"); + broadcast.set_i(1); + auto c2_op = ret.ops.Add(); + BuildOperator(c2_op, "Sub", {indices_tensor, one_tensor}, {sub_one_tensor}, {broadcast}); + } + + auto indices_tensor_adjusted = dummy_->NewDummyName(); + auto c2_op = ret.ops.Add(); + BuildOperator(c2_op, "Conditional", {lt_tensor, sub_one_tensor, indices_tensor}, {indices_tensor_adjusted}, {}); + + // Fill in values specified from the partially-specified ONNX indices tensor + c2_op = ret.ops.Add(); + BuildOperator(c2_op, "ScatterAssign", + {indices_tensor_full, axes_tensor, indices_tensor_adjusted}, + {indices_tensor_full}); + + return indices_tensor_full; +} + +Caffe2Ops Caffe2Backend::CreateDynamicSlice( + OnnxNode* onnx_node, + const ConversionContext& ctx) { + auto op_tmp = CommonOnnxNodeToCaffe2Ops(onnx_node, ctx); + CAFFE_ENFORCE_EQ(op_tmp.ops.size(), 1); + auto* op = op_tmp.ops.Mutable(0); + std::unordered_map args; + for (auto& arg : *op->mutable_arg()) { + args.emplace(arg.name(), &arg); + } + + CAFFE_ENFORCE_GE(op->input_size(), 1); + auto data = op->input(0); + Caffe2Ops ret; + + // First get the shape of the input tensor + auto* c2_op = ret.ops.Add(); + auto size_tensor = dummy_->NewDummyName(); + BuildOperator(c2_op, "Shape", {data}, {size_tensor}); + + // Now get the rank of the tensor by getting the shape of the shape of + // the input tensor c2_op = ret.ops.Add(); - BuildOperator(c2_op, "Cast", {ends_tensor}, {casted_ends_tensor}, {to}); + auto rank_tensor = dummy_->NewDummyName(); + BuildOperator(c2_op, "Shape", {size_tensor}, {rank_tensor}); + + // Axes tensor will be used to populate the fully-specified starts and ends + // arguments to the caffe2 Slice operator. 
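The helper above is applied to both `starts` and `ends` further down. Its net effect on the index values, shifting negative entries by one to match Caffe2's Slice convention and scattering the partially-specified indices into a full-rank vector of defaults, can be reproduced in a standalone sketch (illustrative only):

```
#include <cassert>
#include <cstdint>
#include <vector>

// default_value is 0 for `starts` and -1 for `ends`, matching the
// ConstantFill defaults used below.
std::vector<int64_t> expand_slice_indices(
    const std::vector<int64_t>& indices,
    const std::vector<int64_t>& axes,
    int64_t rank,
    int64_t default_value) {
  std::vector<int64_t> full(rank, default_value);
  for (size_t i = 0; i < axes.size(); ++i) {
    int64_t v = indices[i];
    if (v < 0) {
      v -= 1;  // translate ONNX's negative-index convention to Caffe2's
    }
    full[axes[i]] = v;
  }
  return full;
}

int main() {
  // starts = [1], ends = [-1] on axis 1 of a rank-3 tensor:
  assert((expand_slice_indices({1}, {1}, 3, 0) ==
          std::vector<int64_t>{0, 1, 0}));
  assert((expand_slice_indices({-1}, {1}, 3, -1) ==
          std::vector<int64_t>{-1, -2, -1}));
  return 0;
}
```

Untouched axes keep the default value, and the ends default of -1 is already what Caffe2's Slice reads as "to the end of the dimension".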
+ std::string axes_tensor; + if (onnx_node->node.input_size() > 2) { + axes_tensor = onnx_node->node.input(3); + } else { + axes_tensor = dummy_->NewDummyName(); + auto* c2_op = ret.ops.Add(); + BuildOperator(c2_op, "Range", {rank_tensor}, {axes_tensor}, {}); + } + + // Useful int tensors + auto define_integer_constant = [this, &ret](int val) { + caffe2::Argument value; + value.set_name("value"); + value.set_i(val); + caffe2::Argument dtype; + dtype.set_name("dtype"); + dtype.set_i(static_cast(caffe2::TensorProto::INT64)); + caffe2::Argument shape; + shape.set_name("shape"); + shape.add_ints(1); + auto c2_op = ret.ops.Add(); + auto name = dummy_->NewDummyName(); + BuildOperator(c2_op, "ConstantFill", {}, {name}, + {value, dtype, shape}); + return name; + }; + + auto zero_tensor = define_integer_constant(0); + auto one_tensor = define_integer_constant(1); + + auto starts_tensor_full = PreprocessSliceIndexTensor(onnx_node, + ret, + onnx_node->node.input(1), // starts + axes_tensor, + rank_tensor, + zero_tensor, + one_tensor, + 0); + + auto ends_tensor_full = PreprocessSliceIndexTensor(onnx_node, + ret, + onnx_node->node.input(2), // ends + axes_tensor, + rank_tensor, + zero_tensor, + one_tensor, + -1); // attach the original op at the end c2_op = ret.ops.Add(); c2_op->CopyFrom(*op); c2_op->mutable_input()->Clear(); c2_op->add_input(data); - c2_op->add_input(casted_starts_tensor); - c2_op->add_input(casted_ends_tensor); + c2_op->add_input(starts_tensor_full); + c2_op->add_input(ends_tensor_full); c2_op->mutable_arg()->Clear(); for (const auto& kv : args) { c2_op->add_arg()->CopyFrom(*kv.second); diff --git a/caffe2/onnx/backend.h b/caffe2/onnx/backend.h index eab0e2f7e1f131..2b74dec1e3ccea 100644 --- a/caffe2/onnx/backend.h +++ b/caffe2/onnx/backend.h @@ -11,7 +11,7 @@ #include #include -constexpr int kKnownOpsetVersion = 7; +constexpr int kKnownOpsetVersion = 9; namespace caffe2 { namespace onnx { @@ -212,6 +212,17 @@ class CAFFE2_API Caffe2Backend { Caffe2Ops CreateSlice(OnnxNode* onnx_node, const ConversionContext& ctx); + std::string PreprocessSliceIndexTensor(OnnxNode* onnx_node, + Caffe2Ops& ret, + std::string indices_tensor, + std::string axes_tensor, + std::string rank_tensor, + std::string zero_tensor, + std::string one_tensor, + int default_value); + + Caffe2Ops CreateDynamicSlice(OnnxNode* onnx_node, const ConversionContext& ctx); + Caffe2Ops CreateSplit(OnnxNode* onnx_node, const ConversionContext& ctx); Caffe2Ops CreateReciprocal(OnnxNode* onnx_node, const ConversionContext& ctx); diff --git a/caffe2/operators/copy_op.cc b/caffe2/operators/copy_op.cc new file mode 100644 index 00000000000000..582e31475780bf --- /dev/null +++ b/caffe2/operators/copy_op.cc @@ -0,0 +1,198 @@ +#include "caffe2/operators/copy_op.h" + +namespace caffe2 { + +// From CPU, copy it to whatever the current context +REGISTER_CPU_OPERATOR( + CopyFromCPUInput, + CopyOp); +REGISTER_CPU_OPERATOR( + CopyOnDeviceLike, + CopyOnDeviceLikeOp); +REGISTER_CPU_OPERATOR(Copy, CopyOp); + +OPERATOR_SCHEMA(Copy) + .NumInputs(1) + .NumOutputs(1) + .IdenticalTypeAndShape() + .InputsCanCrossDevices() + .SetDoc(R"DOC( +Copy input tensor into output, potentially across devices. + +Github Links: + +- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/copy_op.cc +- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/copy_op.h + + +
+ + Example + +**Code** + +``` + +workspace.ResetWorkspace() + +op = core.CreateOperator( + "Copy", + ["input"], + ["output"] +) + +workspace.FeedBlob("input", np.random.rand(3,3)) +print("input:", workspace.FetchBlob("input")) +workspace.RunOperatorOnce(op) +print("output:", workspace.FetchBlob("output")) + +``` + +**Result** + +``` + +input: +[[0.16826761 0.68168217 0.55196001] + [0.19735483 0.34837823 0.69015595] + [0.09448514 0.57390828 0.37097193]] +output: +[[0.16826761 0.68168217 0.55196001] + [0.19735483 0.34837823 0.69015595] + [0.09448514 0.57390828 0.37097193]] + +``` + +
+ +)DOC") + .Input(0, "input", "(*Tensor*): input tensor to copy") + .Output(0, "output", "(*Tensor*): copy of input tensor"); + +OPERATOR_SCHEMA(CopyGPUToCPU) + .NumInputs(1) + .NumOutputs(1) + .IdenticalTypeAndShape() + .InputsCanCrossDevices() + .DeviceInferenceFunction([](const OperatorDef& def) { + CAFFE_ENFORCE( + def.has_device_option(), + "CopyGPUToCPU op should have cuda device option."); + auto& cuda_option = def.device_option(); + auto cpu_option = DeviceOption(); + vector in_dev(def.input_size(), cuda_option); + vector out_dev(def.output_size(), cpu_option); + return std::make_pair(in_dev, out_dev); + }) + .SetDoc(R"DOC( +Copy tensor for GPU to CPU context. Must be run under GPU device option. +)DOC") + .Input(0, "input", "The input tensor.") + .Output(0, "output", "Tensor that will contain a copy of the input."); + +OPERATOR_SCHEMA(CopyCPUToGPU) + .NumInputs(1) + .NumOutputs(1) + .IdenticalTypeAndShape() + .InputsCanCrossDevices() + .DeviceInferenceFunction([](const OperatorDef& def) { + CAFFE_ENFORCE( + def.has_device_option(), + "CopyCPUToGPU op should have cuda device option."); + auto& cuda_option = def.device_option(); + auto cpu_option = DeviceOption(); + vector in_dev(def.input_size(), cpu_option); + vector out_dev(def.output_size(), cuda_option); + return std::make_pair(in_dev, out_dev); + }) + .SetDoc(R"DOC( +Copy tensor for CPU to GPU context. Must be run under GPU device option. +)DOC") + .Input(0, "input", "The input tensor.") + .Output(0, "output", "Tensor that will contain a copy of the input."); + +OPERATOR_SCHEMA(CopyFromCPUInput) + .NumInputs(1) + .NumOutputs(1) + .IdenticalTypeAndShape() + .InputsCanCrossDevices() + .DeviceInferenceFunction([](const OperatorDef& def) { + auto op_device = + def.has_device_option() ? def.device_option() : DeviceOption(); + auto cpu_option = DeviceOption(); + vector in_dev(def.input_size(), cpu_option); + vector out_dev(def.output_size(), op_device); + return std::make_pair(in_dev, out_dev); + }) + .SetDoc(R"DOC( +Take a CPU input tensor and copy it to an output in the current +Context (GPU or CPU). This may involves cross-device MemCpy. 
+)DOC") + .Input(0, "input", "The input CPU tensor.") + .Output(0, "output", "either a TensorCUDA or a TensorCPU"); + +OPERATOR_SCHEMA(CopyOnDeviceLike) + .NumInputs(2) + .NumOutputs(1) + .SetDoc("Copy input tensor into output to the specific device.") + .Input(0, "input", "The input tensor.") + .Input(1, "dst", "Tensor, on which device the copy will be performed.") + .Output(0, "output", "Tensor that will contain a copy of the input."); + +struct GetCopyGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + vector GetGradientDefs() override { + return SingleGradientDef( + "CopyOnDeviceLike", + "", + vector{GO(0), I(0)}, + vector{GI(0)}); + } +}; +REGISTER_GRADIENT(Copy, GetCopyGradient); + +struct GetGPUToCPUGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + vector GetGradientDefs() override { + if (g_output_[0].IsDense()) { + return SingleGradientDef( + "CopyCPUToGPU", "", vector{GO(0)}, vector{GI(0)}); + } else { + return vector{CreateOperatorDef( + "CopyCPUToGPU", + "", + std::vector{GO_I(0)}, + std::vector{GI_I(0)}), + CreateOperatorDef( + "CopyCPUToGPU", + "", + std::vector{GO_V(0)}, + std::vector{GI_V(0)})}; + } + } +}; +REGISTER_GRADIENT(CopyGPUToCPU, GetGPUToCPUGradient); + +struct GetCPUToGPUGradient : public GradientMakerBase { + using GradientMakerBase::GradientMakerBase; + vector GetGradientDefs() override { + if (g_output_[0].IsDense()) { + return SingleGradientDef( + "CopyGPUToCPU", "", vector{GO(0)}, vector{GI(0)}); + } else { + return vector{CreateOperatorDef( + "CopyGPUToCPU", + "", + std::vector{GO_I(0)}, + std::vector{GI_I(0)}), + CreateOperatorDef( + "CopyGPUToCPU", + "", + std::vector{GO_V(0)}, + std::vector{GI_V(0)})}; + } + } +}; +REGISTER_GRADIENT(CopyCPUToGPU, GetCPUToGPUGradient); + +} // namespace caffe2 diff --git a/caffe2/operators/copy_op.cu b/caffe2/operators/copy_op.cu new file mode 100644 index 00000000000000..e833e720e556f3 --- /dev/null +++ b/caffe2/operators/copy_op.cu @@ -0,0 +1,48 @@ +#include "caffe2/core/context_gpu.h" +#include "caffe2/operators/copy_op.h" + +namespace caffe2 { + +template <> +class CopyOnDeviceLikeOp + : public Operator { + public: + CopyOnDeviceLikeOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + USE_OPERATOR_FUNCTIONS(CUDAContext); + + bool RunOnDevice() override { + auto& input = Input(0); + auto* output = OperatorBase::Output(0, CUDA); + CUDAContext context(GetGPUIDForPointer(Input(1).raw_data())); + output->ResizeLike(input); + context.template CopyItems( + input.meta(), + input.size(), + input.raw_data(), + output->raw_mutable_data(input.meta())); + return true; + } +}; + +// From CPU, copy it to whatever the current context +REGISTER_CUDA_OPERATOR( + CopyFromCPUInput, + CopyOp); + +// CopyGPUToCPU and CopyCPUToGPU should both be carried out in a cuda context, +// since gpu code will be involved. +REGISTER_CUDA_OPERATOR( + CopyGPUToCPU, + CopyOp); +REGISTER_CUDA_OPERATOR( + CopyCPUToGPU, + CopyOp); +// If we only specify Copy, we assume that it is a gpu to gpu copy - maybe +// involving different GPUs. 
+REGISTER_CUDA_OPERATOR(Copy, CopyOp); + +REGISTER_CUDA_OPERATOR( + CopyOnDeviceLike, + CopyOnDeviceLikeOp); +} // namespace caffe2 diff --git a/caffe2/operators/copy_op.h b/caffe2/operators/copy_op.h new file mode 100644 index 00000000000000..11e8e15fbcf005 --- /dev/null +++ b/caffe2/operators/copy_op.h @@ -0,0 +1,38 @@ +#ifndef CAFFE2_OPERATORS_COPY_OP_H_ +#define CAFFE2_OPERATORS_COPY_OP_H_ + +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" + +namespace caffe2 { + +template +class CopyOp : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + USE_SIMPLE_CTOR_DTOR(CopyOp) + + bool RunOnDevice() override { + auto& input = this->template Input(0, SrcContext::GetDeviceType()); + auto* output = + this->template Output(0, DstContext::GetDeviceType()); + output->ResizeLike(input); + this->context_.template CopyItems( + input.meta(), + input.size(), + input.raw_data(), + output->raw_mutable_data(input.meta())); + return true; + } +}; + +template +class CopyOnDeviceLikeOp : public CopyOp { + public: + CopyOnDeviceLikeOp(const OperatorDef& operator_def, Workspace* ws) + : CopyOp(operator_def, ws) {} +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_COPY_OP_H_ diff --git a/caffe2/operators/cross_entropy_op.cu b/caffe2/operators/cross_entropy_op.cu index 7030e846b714e2..df7a124d29711f 100644 --- a/caffe2/operators/cross_entropy_op.cu +++ b/caffe2/operators/cross_entropy_op.cu @@ -13,7 +13,7 @@ __global__ void LabelCrossEntropyKernel( const float log_threshold, float* Ydata) { CUDA_1D_KERNEL_LOOP(i, N) { CUDA_KERNEL_ASSERT(labeldata[i] >= 0 && labeldata[i] < D); - Ydata[i] = -logf(max(Xdata[i * D + labeldata[i]], log_threshold)); + Ydata[i] = -logf(fmaxf(Xdata[i * D + labeldata[i]], log_threshold)); } } __global__ void LabelCrossEntropyGradientKernel( @@ -21,7 +21,7 @@ __global__ void LabelCrossEntropyGradientKernel( const float* dYdata, const float log_threshold, float* dXdata) { CUDA_1D_KERNEL_LOOP(i, N) { int idx = i * D + labeldata[i]; - dXdata[idx] = - dYdata[i] / max(Xdata[idx], log_threshold); + dXdata[idx] = - dYdata[i] / fmaxf(Xdata[idx], log_threshold); } } } // namespace diff --git a/caffe2/operators/ensure_cpu_output_op.h b/caffe2/operators/ensure_cpu_output_op.h index 08207644f7f094..3b8cb439f12066 100644 --- a/caffe2/operators/ensure_cpu_output_op.h +++ b/caffe2/operators/ensure_cpu_output_op.h @@ -15,9 +15,9 @@ class EnsureCPUOutputOp : public Operator { : Operator(operator_def, ws) {} bool RunOnDevice() override { - if (this->template InputIsType(0, CPU)) { + if (this->InputIsTensorType(0, CPU)) { return CopyWithContext(); - } else if (this->template InputIsType(0, Context::GetDeviceType())) { + } else if (this->InputIsTensorType(0, Context::GetDeviceType())) { // CUDA Context will go this branch return CopyWithContext(); } else { diff --git a/caffe2/operators/half_float_ops.h b/caffe2/operators/half_float_ops.h index e6698a0b7283d1..b8d5dacf69472b 100644 --- a/caffe2/operators/half_float_ops.h +++ b/caffe2/operators/half_float_ops.h @@ -76,7 +76,7 @@ inline std::vector Float16FillerTensorInference( vector out(1); ArgumentHelper helper(def); out[0].set_data_type(static_cast( - helper.GetSingleArgument("dtype", TensorProto_DataType_FLOAT))); + helper.GetSingleArgument("dtype", TensorProto_DataType_FLOAT16))); auto shape = helper.GetRepeatedArgument("shape"); for (int d : shape) { out[0].add_dims(d); diff --git a/caffe2/operators/if_op.h b/caffe2/operators/if_op.h index cff2a620ef4694..e76fea0ee8f21a 100644 --- a/caffe2/operators/if_op.h +++ 
b/caffe2/operators/if_op.h @@ -32,7 +32,7 @@ class IfOp final : public Operator { bool RunOnDevice() override { CAFFE_ENFORCE( - this->template InputIsType(0, Context::GetDeviceType()), + this->InputIsTensorType(0, Context::GetDeviceType()), "Invalid condition in If operator: tensor expected"); const auto& condition = Input(0); diff --git a/caffe2/operators/length_split_op.cc b/caffe2/operators/length_split_op.cc new file mode 100644 index 00000000000000..7c342d154491b1 --- /dev/null +++ b/caffe2/operators/length_split_op.cc @@ -0,0 +1,37 @@ +#include "caffe2/operators/length_split_op.h" + +namespace caffe2 { + +REGISTER_CPU_OPERATOR(LengthsSplit, LengthsSplitOp); + +OPERATOR_SCHEMA(LengthsSplit) + .NumInputs(1, 2) + .NumOutputs(1) + .ScalarType(TensorProto::INT32) + .SetDoc(R"DOC( +Given input vector LENGTHS, and input n_split, LengthsSplit returns +a single output vector. It "splits" each length into n_split values which add +up to the original length. It will attempt to do equal splits, and if not possible, +it orders larger values first. If the n_split is larger than the length, zero +padding will be applied. + +e.g. LENGTHS = [9 4 5] + n_split = 3 + Y = [3 3 3 2 1 1 2 2 1] + +e.g. LENGTHS = [2, 1, 2] + n_split = 3 + Y = [1 1 0 1 0 0 1 1 0] +)DOC") + .Arg("n_split", "Number of splits for each element in LENGTHS") + .Input(0, "LENGTHS", "Mx1 Input tensor denoting INT32 lengths") + .Input( + 1, + "n_split", + "(Optional) Number of splits for each element in LENGTHS (overrides argument)") + .Output(0, "Y", "(M*n_split)x1 Output vector denoting split lengths"); + +// TODO: Write gradient for this when needed +GRADIENT_NOT_IMPLEMENTED_YET(LengthsSplit); + +} // namespace caffe2 diff --git a/caffe2/operators/length_split_op.h b/caffe2/operators/length_split_op.h new file mode 100644 index 00000000000000..d8c98bf085c8a2 --- /dev/null +++ b/caffe2/operators/length_split_op.h @@ -0,0 +1,75 @@ +#ifndef CAFFE2_OPERATORS_LENGTH_SPLIT_OP_H_ +#define CAFFE2_OPERATORS_LENGTH_SPLIT_OP_H_ + +#include "caffe2/core/common_omp.h" +#include "caffe2/core/context.h" +#include "caffe2/core/logging.h" +#include "caffe2/core/operator.h" +#include "caffe2/utils/math.h" + +namespace caffe2 { + +template +class LengthsSplitOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + + LengthsSplitOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + n_split_(OperatorBase::GetSingleArgument("n_split", 0)) { + if (InputSize() == 1) { + // If not specified, then must have this argument + CAFFE_ENFORCE( + OperatorBase::HasArgument("n_split"), + "Argument `n_split` is missing and was not specified as input."); + CAFFE_ENFORCE( + n_split_ > 0, + "`n_split` must contain a positive value for defined behavior."); + } + } + ~LengthsSplitOp() {} + + bool RunOnDevice() override { + const auto& L = Input(0); + CAFFE_ENFORCE_EQ(L.ndim(), 1, "Input `LENGTHS` should be a 1D vector."); + + if (InputSize() > 1) { + // We potentially have n_split specified as inputs as well + CAFFE_ENFORCE( + Input(1).ndim() == 1 && Input(1).size() == 1, + "Input `n_split` should be a vector of size 1."); + + const auto& input1 = Input(1); + context_.template CopyItems( + input1.meta(), 1, input1.raw_data(), &n_split_); + } + + CAFFE_ENFORCE( + n_split_ > 0, + "`n_split` must contain a positive value for defined behavior."); + const auto M = L.size(); + + auto* Y = Output(0); + Y->Resize(M * n_split_); + + const int32_t* Ldata = L.template data(); + int32_t* Ydata = Y->template mutable_data(); + + for 
(int i = 0; i < M; i++) { + int32_t mod = Ldata[i] % n_split_; + int32_t res = + mod != 0 ? math::divUp(Ldata[i], n_split_) : Ldata[i] / n_split_ + 1; + for (int j = 0; j < n_split_; j++) { + Ydata[(i * n_split_) + j] = mod-- > 0 ? res : res - 1; + } + } + return true; + } + + private: + int32_t n_split_; +}; + +} // namespace caffe2 + +#endif // CAFFE2_OPERATORS_LENGTH_SPLIT_OP_H_ diff --git a/caffe2/operators/lengths_tile_op.cc b/caffe2/operators/lengths_tile_op.cc index e832fe8723a0ce..d5af0a91bd65c0 100644 --- a/caffe2/operators/lengths_tile_op.cc +++ b/caffe2/operators/lengths_tile_op.cc @@ -1,6 +1,50 @@ #include "caffe2/operators/lengths_tile_op.h" namespace caffe2 { + +template <> +bool LengthsTileOp::RunOnDevice() { + auto& data = Input(DATA); + auto& lengths = Input(LENGTHS); + auto* output = Output(0); + + CAFFE_ENFORCE_EQ(lengths.ndim(), 1, "LENGTHS must be 1-D"); + CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D"); + CAFFE_ENFORCE_EQ(lengths.size(), data.dim(0)); + + // Context::CopyFrom and math::Sum need the same context to avoid race + // conditions + // why? CPUContext is not used in Sum + lengths_host_.CopyFrom(lengths, &context_); + context_.FinishDeviceComputation(); + auto lengths_size = lengths_host_.size(); + auto* lengths_data = lengths_host_.data(); + + int32_t total_length = 0; + CPUContext cpuContext; + math::Sum( + lengths_size, lengths_data, &total_length, &cpuContext); + + auto shape = data.dims(); + shape[0] = total_length; + output->Resize(shape); + + auto block_bytesize = data.size_from_dim(1) * data.meta().itemsize(); + auto src = static_cast(data.raw_data()); + auto out = static_cast(output->raw_mutable_data(data.meta())); + + for (TIndex i = 0; i < lengths_size; ++i) { + auto length = lengths_data[i]; + CAFFE_ENFORCE_GE(length, 0); + for (int32_t j = 0; j < length; ++j) { + context_.CopyBytesSameDevice(block_bytesize, src, out); + out += block_bytesize; + } + src += block_bytesize; + } + return true; +} + REGISTER_CPU_OPERATOR(LengthsTile, LengthsTileOp); OPERATOR_SCHEMA(LengthsTile) diff --git a/caffe2/operators/lengths_tile_op.cu b/caffe2/operators/lengths_tile_op.cu new file mode 100644 index 00000000000000..aebb33c1460a56 --- /dev/null +++ b/caffe2/operators/lengths_tile_op.cu @@ -0,0 +1,110 @@ +#include "caffe2/core/context_gpu.h" +#include "caffe2/operators/lengths_tile_op.h" + +namespace caffe2 { + +template +__global__ void lengthsTileKernel( + int numElements, + int rowSize, + const T* input, + T* output, + const int32_t* inputRowOffsets) { + CUDA_1D_KERNEL_LOOP(i, numElements) { + auto outputRowIndex = i / rowSize; + auto inputBlockOffset = inputRowOffsets[outputRowIndex]; + auto indexInRow = i - outputRowIndex * rowSize; + output[i] = input[inputBlockOffset + indexInRow]; + } +} + +template <> +bool LengthsTileOp::RunOnDevice() { + auto& data = Input(DATA); + auto& lengths = Input(LENGTHS); + auto* output = Output(0); + + CAFFE_ENFORCE_EQ(lengths.ndim(), 1, "LENGTHS must be 1-D"); + CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D"); + CAFFE_ENFORCE_EQ(lengths.size(), data.dim(0)); + + lengths_host_.CopyFrom(lengths, &context_); + context_.FinishDeviceComputation(); + auto lengths_size = lengths_host_.size(); + auto* lengths_data = lengths_host_.data(); + + int32_t total_length = 0; + CPUContext cpuContext; + math::Sum( + lengths_size, lengths_data, &total_length, &cpuContext); + + auto shape = data.dims(); + shape[0] = total_length; + output->Resize(shape); + + auto numElementsPerRow = data.size_from_dim(1); + auto 
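As a sanity check on the LengthsSplit schema above, here is a small pure-Python sketch of the splitting rule the loop implements (larger pieces first, zero padding when `n_split` exceeds the length); it reproduces both worked examples from the operator doc:

```
def lengths_split(lengths, n_split):
    # Each length L becomes n_split pieces summing to L; the first L % n_split
    # pieces get the extra unit, mirroring the mod-- logic in the C++ loop.
    out = []
    for length in lengths:
        base, mod = divmod(length, n_split)
        out.extend([base + 1] * mod + [base] * (n_split - mod))
    return out

assert lengths_split([9, 4, 5], 3) == [3, 3, 3, 2, 1, 1, 2, 2, 1]
assert lengths_split([2, 1, 2], 3) == [1, 1, 0, 1, 0, 0, 1, 1, 0]
```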
numElements = total_length * numElementsPerRow; + auto numBlocks = CAFFE_GET_BLOCKS(numElements); + + rowMappingHost_.Resize(total_length); + rowMappingDevice_.Resize(total_length); + auto* rowOffsets = rowMappingHost_.mutable_data(); + int32_t outputRow = 0; + for (TIndex i = 0; i < lengths_size; i++) { + auto length = lengths_data[i]; + for (int32_t j = 0; j < length; j++) { + rowOffsets[outputRow++] = i * numElementsPerRow; + } + } + + context_.CopyFromCPU( + total_length, + rowMappingHost_.data(), + rowMappingDevice_.mutable_data()); + context_.FinishDeviceComputation(); + + if (data.template IsType()) { + lengthsTileKernel<<< + numBlocks, + CAFFE_CUDA_NUM_THREADS, + 0, + context_.cuda_stream()>>>( + numElements, + numElementsPerRow, + data.data(), + output->mutable_data(), + rowMappingDevice_.data()); + } else if (data.template IsType()) { + lengthsTileKernel<<< + numBlocks, + CAFFE_CUDA_NUM_THREADS, + 0, + context_.cuda_stream()>>>( + numElements, + numElementsPerRow, + data.data(), + output->mutable_data(), + rowMappingDevice_.data()); + } else if (data.template IsType()) { + lengthsTileKernel<<< + numBlocks, + CAFFE_CUDA_NUM_THREADS, + 0, + context_.cuda_stream()>>>( + numElements, + numElementsPerRow, + data.data(), + output->mutable_data(), + rowMappingDevice_.data()); + } else { + CAFFE_THROW( + "LengthsTile operator only supports 32-bit float, int and int64_t" + " types but input was of type ", + data.meta().name()); + } + return true; +} + +REGISTER_CUDA_OPERATOR(LengthsTile, LengthsTileOp); + +} // namespace caffe2 diff --git a/caffe2/operators/lengths_tile_op.h b/caffe2/operators/lengths_tile_op.h index c92adcb956d916..c9bd5ef500d996 100644 --- a/caffe2/operators/lengths_tile_op.h +++ b/caffe2/operators/lengths_tile_op.h @@ -13,44 +13,6 @@ class LengthsTileOp : public Operator { USE_SIMPLE_CTOR_DTOR(LengthsTileOp); bool RunOnDevice() override { - auto& data = Input(DATA); - auto& lengths = Input(LENGTHS); - auto* output = Output(0); - - CAFFE_ENFORCE_EQ(lengths.ndim(), 1, "LENGTHS must be 1-D"); - CAFFE_ENFORCE_GE(data.ndim(), 1, "DATA should be at least 1-D"); - CAFFE_ENFORCE_EQ(lengths.size(), data.dim(0)); - - // Context::CopyFrom and math::Sum need the same context to avoid race - // conditions - // why? 
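The CUDA path above precomputes, on the host, one input-row offset per output row and then gathers with a flat kernel. Functionally LengthsTile is just a per-row repeat; a NumPy sketch of the expected output:

```
import numpy as np

def lengths_tile(data, lengths):
    # Row i of DATA is emitted lengths[i] times, which is exactly what the
    # rowMapping offsets fed to lengthsTileKernel encode.
    assert data.shape[0] == len(lengths)
    return np.repeat(data, lengths, axis=0)

data = np.arange(6, dtype=np.float32).reshape(3, 2)
print(lengths_tile(data, [2, 0, 3]))  # row 0 twice, row 1 dropped, row 2 three times
```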
CPUContext is not used in Sum - lengths_host_.CopyFrom(lengths, &context_); - context_.FinishDeviceComputation(); - auto lengths_size = lengths_host_.size(); - auto* lengths_data = lengths_host_.data(); - - int32_t total_length = 0; - CPUContext cpuContext; - math::Sum( - lengths_size, lengths_data, &total_length, &cpuContext); - - auto shape = data.dims(); - shape[0] = total_length; - output->Resize(shape); - - auto block_bytesize = data.size_from_dim(1) * data.meta().itemsize(); - auto src = static_cast(data.raw_data()); - auto out = static_cast(output->raw_mutable_data(data.meta())); - - for (TIndex i = 0; i < lengths_size; ++i) { - auto length = lengths_data[i]; - CAFFE_ENFORCE_GE(length, 0); - for (int32_t j = 0; j < length; ++j) { - context_.CopyBytesSameDevice(block_bytesize, src, out); - out += block_bytesize; - } - src += block_bytesize; - } return true; } @@ -58,6 +20,8 @@ class LengthsTileOp : public Operator { private: Tensor lengths_host_{CPU}; + Tensor rowMappingHost_{CPU}; + Tensor rowMappingDevice_{Context::GetDeviceType()}; }; } // namespace caffe2 diff --git a/caffe2/operators/lengths_tile_op_gpu.cc b/caffe2/operators/lengths_tile_op_gpu.cc deleted file mode 100644 index 65ed44b735de43..00000000000000 --- a/caffe2/operators/lengths_tile_op_gpu.cc +++ /dev/null @@ -1,6 +0,0 @@ -#include "caffe2/core/context_gpu.h" -#include "caffe2/operators/lengths_tile_op.h" - -namespace caffe2 { -REGISTER_CUDA_OPERATOR(LengthsTile, LengthsTileOp); -} // namespace caffe2 diff --git a/caffe2/operators/onnxifi_op.cc b/caffe2/operators/onnxifi_op.cc index 4e9f6f2ac280f1..aee7fff4bc3391 100644 --- a/caffe2/operators/onnxifi_op.cc +++ b/caffe2/operators/onnxifi_op.cc @@ -15,7 +15,7 @@ void BlobToTensorDescriptor( // Memory type // We only allow weights to be CPU tensor for now CAFFE_ENFORCE( - blob->template IsType(CPU), + blob->IsTensorType(CPU), "Initialization blob ", name, " needs to be TensorCPU"); diff --git a/caffe2/operators/operator_fallback_gpu.h b/caffe2/operators/operator_fallback_gpu.h index b0ee9611a69042..8ef39e7c0e78d1 100644 --- a/caffe2/operators/operator_fallback_gpu.h +++ b/caffe2/operators/operator_fallback_gpu.h @@ -64,7 +64,7 @@ class GPUFallbackOpEx final : public Operator { bool RunOnDevice() override { bool need_sync = false; for (int i = 0; i < InputSize(); ++i) { - if (this->template InputIsType(i, CUDA)) { + if (this->InputIsTensorType(i, CUDA)) { local_input_blobs_[i]->GetMutableTensor(CPU)->CopyFrom( Input(i), &context_); need_sync = true; @@ -95,7 +95,7 @@ class GPUFallbackOpEx final : public Operator { continue; } CAFFE_ENFORCE( - local_output_blobs_[i]->template IsType(CPU), + local_output_blobs_[i]->IsTensorType(CPU), "GPU fallback op currently does not support non-TensorCPU " "output type who needs copying."); Output(i)->CopyFrom(local_output_blobs_[i]->template Get()); diff --git a/caffe2/operators/order_switch_ops.cu b/caffe2/operators/order_switch_ops.cu index 27a71a69141098..f63a7d87fa88fe 100644 --- a/caffe2/operators/order_switch_ops.cu +++ b/caffe2/operators/order_switch_ops.cu @@ -1,91 +1,168 @@ #include "caffe2/operators/order_switch_ops.h" + #include "caffe2/core/context_gpu.h" +#include "caffe2/utils/fixed_divisor.h" namespace caffe2 { -__global__ void NHWC2NCHWKernel( - const int N, - const int HW, +template +__global__ void NHWC2NCHWCUDAKernel( + const int size, +#ifndef __HIPCC__ + const FixedDivisor C, + const FixedDivisor HxW, +#else const int C, - const float* X, - float* Y) { - CUDA_1D_KERNEL_LOOP(i, N * HW * C) { - const int c = i % C; - 
const int hw = i / C % HW; - const int n = i / C / HW; - Y[(n * C + c) * HW + hw] = X[i]; + const int HxW, +#endif + const T* X, + T* Y) { + CUDA_1D_KERNEL_LOOP(i, size) { + int n; + int c; + int hxw; + + int c_d; + int hxw_d; +#ifndef __HIPCC__ + HxW.DivMod(i, &c, &hxw); + C.DivMod(c, &n, &c); + + c_d = C.d(); + hxw_d = HxW.d(); +#else + c = i / HxW; + hxw = i % HxW; + n = c / C; + c = c % C; + + c_d = C; + hxw_d = HxW; +#endif + +#if __CUDA_ARCH__ >= 350 + Y[i] = __ldg(X + (n * hxw_d + hxw) * c_d + c); +#else + Y[i] = X[(n * hxw_d + hxw) * c_d + c]; +#endif } } -__global__ void NCHW2NHWCKernel( - const int N, +template +__global__ void NCHW2NHWCCUDAKernel( + const int size, +#ifndef __HIPCC__ + const FixedDivisor C, + const FixedDivisor HxW, +#else const int C, - const int HW, - const float* X, - float* Y) { - CUDA_1D_KERNEL_LOOP(i, N * C * HW) { - const int hw = i % HW; - const int c = i / HW % C; - const int n = i / C / HW; - Y[(n * HW + hw) * C + c] = X[i]; + const int HxW, +#endif + const T* X, + T* Y) { + CUDA_1D_KERNEL_LOOP(i, size) { + int n; + int c; + int hxw; + + int c_d; + int hxw_d; +#ifndef __HIPCC__ + C.DivMod(i, &hxw, &c); + HxW.DivMod(hxw, &n, &hxw); + + c_d = C.d(); + hxw_d = HxW.d(); +#else + hxw = i / C; + c = i % C; + n = hxw / HxW; + hxw = hxw % HxW; + + c_d = C; + hxw_d = HxW; +#endif +#if __CUDA_ARCH__ >= 350 + Y[i] = __ldg(X + (n * c_d + c) * hxw_d + hxw); +#else + Y[i] = X[(n * c_d + c) * hxw_d + hxw]; +#endif } } template <> bool NHWC2NCHWOp::RunOnDevice() { - auto& X = Input(0); + const auto& X = Input(0); auto* Y = Output(0); - - auto ndim = X.ndim(); - DCHECK_GE(ndim, 3); - const int N = X.dim32(0), C = X.dim32(ndim - 1); + const int ndim = X.ndim(); + CAFFE_ENFORCE_GE(ndim, 3); + const int N = X.dim32(0); + const int C = X.dim32(ndim - 1); vector Y_dims(ndim); Y_dims[0] = N; Y_dims[1] = C; - size_t image_size = 1; - for (auto i = 2; i < ndim; ++i) { + int HxW = 1; + for (int i = 2; i < ndim; ++i) { Y_dims[i] = X.dim32(i - 1); - image_size *= Y_dims[i]; + HxW *= Y_dims[i]; } Y->Resize(Y_dims); - - NHWC2NCHWKernel<<< - CAFFE_GET_BLOCKS(X.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - N, image_size, C, X.data(), Y->template mutable_data()); + const int size = X.size(); + NHWC2NCHWCUDAKernel + <<>>( + size, +#ifndef __HIPCC__ + FixedDivisor(C), + FixedDivisor(HxW), +#else + C, + HxW, +#endif + X.data(), + Y->template mutable_data()); return true; } template <> bool NCHW2NHWCOp::RunOnDevice() { - auto& X = Input(0); + const auto& X = Input(0); auto* Y = Output(0); - - auto ndim = X.ndim(); - DCHECK_GE(X.ndim(), 3); - const int N = X.dim32(0), C = X.dim32(1); + const int ndim = X.ndim(); + CAFFE_ENFORCE_GE(X.ndim(), 3); + const int N = X.dim32(0); + const int C = X.dim32(1); vector Y_dims(ndim); Y_dims[0] = N; - size_t image_size = 1; + int HxW = 1; for (auto i = 1; i < ndim - 1; ++i) { Y_dims[i] = X.dim32(i + 1); - image_size *= Y_dims[i]; + HxW *= Y_dims[i]; } Y_dims[ndim - 1] = C; Y->Resize(Y_dims); - - NCHW2NHWCKernel<<< - CAFFE_GET_BLOCKS(X.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - N, C, image_size, X.data(), Y->template mutable_data()); + const int size = X.size(); + NCHW2NHWCCUDAKernel + <<>>( + size, +#ifndef __HIPCC__ + FixedDivisor(C), + FixedDivisor(HxW), +#else + C, + HxW, +#endif + X.data(), + Y->template mutable_data()); return true; } - REGISTER_CUDA_OPERATOR(NHWC2NCHW, NHWC2NCHWOp); REGISTER_CUDA_OPERATOR(NCHW2NHWC, NCHW2NHWCOp); -} // namespace caffe2 + +} // namespace caffe2 diff --git 
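The rewritten order-switch kernels above decompose each output index with FixedDivisor instead of raw `/` and `%`, but the index mapping itself is unchanged. A NumPy sketch of the NHWC2NCHW index math, checked against a plain transpose:

```
import numpy as np

def nhwc2nchw_indexing(X_nhwc):
    # Mirror the kernel: split output index i into (n, c, hxw), then read the
    # NHWC buffer at (n * HxW + hxw) * C + c.
    N, C = X_nhwc.shape[0], X_nhwc.shape[-1]
    HxW = int(np.prod(X_nhwc.shape[1:-1]))
    x = X_nhwc.reshape(-1)
    y = np.empty(N * C * HxW, dtype=X_nhwc.dtype)
    for i in range(y.size):
        c, hxw = divmod(i, HxW)   # FixedDivisor for HxW
        n, c = divmod(c, C)       # FixedDivisor for C
        y[i] = x[(n * HxW + hxw) * C + c]
    return y.reshape((N, C) + X_nhwc.shape[1:-1])

X = np.random.rand(2, 3, 4, 5).astype(np.float32)            # NHWC input
assert np.array_equal(nhwc2nchw_indexing(X), X.transpose(0, 3, 1, 2))
```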
a/caffe2/operators/order_switch_ops_cudnn.cc b/caffe2/operators/order_switch_ops_cudnn.cc new file mode 100644 index 00000000000000..4cb0034e7ee60d --- /dev/null +++ b/caffe2/operators/order_switch_ops_cudnn.cc @@ -0,0 +1,160 @@ +#include "caffe2/operators/order_switch_ops.h" + +#include +#include +#include + +#include "caffe2/core/context_gpu.h" +#include "caffe2/core/cudnn_wrappers.h" +#include "caffe2/core/types.h" + +namespace caffe2 { + +namespace { + +class CuDNNOrderSwithOpBase : public Operator { + public: + USE_OPERATOR_FUNCTIONS(CUDAContext); + + CuDNNOrderSwithOpBase(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), cudnn_wrapper_(&context_) { + CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&X_desc_)); + CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&Y_desc_)); + } + + virtual ~CuDNNOrderSwithOpBase() { + CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(X_desc_)); + CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(Y_desc_)); + } + + protected: + void SetTensorDescriptor( + const cudnnDataType_t data_type, + const StorageOrder order, + const std::vector& data_dims, + cudnnTensorDescriptor_t data_desc) const { + const int ndim = data_dims.size(); + const int N = data_dims[0]; + const int C = order == StorageOrder::NCHW ? data_dims[1] : data_dims.back(); + if (ndim == 3) { + const int H = 1; + const int W = order == StorageOrder::NCHW ? data_dims[2] : data_dims[1]; + CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( + data_desc, GetCudnnTensorFormat(order), data_type, N, C, H, W)); + } else if (ndim == 4) { + const int H = order == StorageOrder::NCHW ? data_dims[2] : data_dims[1]; + const int W = order == StorageOrder::NCHW ? data_dims[3] : data_dims[2]; + CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( + data_desc, GetCudnnTensorFormat(order), data_type, N, C, H, W)); + } else { + const int H = order == StorageOrder::NCHW ? data_dims[2] : data_dims[1]; + const int W = order == StorageOrder::NCHW ? data_dims[3] : data_dims[2]; + const auto l_iter = order == StorageOrder::NCHW ? data_dims.cbegin() + 4 + : data_dims.cbegin() + 3; + const auto r_iter = + order == StorageOrder::NCHW ? data_dims.cend() : data_dims.cend() - 1; + const int D = std::accumulate(l_iter, r_iter, 1, std::multiplies()); + const std::array dims = {N, C, H, W, D}; + const std::array strides = order == StorageOrder::NCHW + ? 
std::array{C * H * W * D, H * W * D, W * D, D, 1} + : std::array{C * H * W * D, 1, W * D * C, D * C, C}; + CUDNN_ENFORCE(cudnnSetTensorNdDescriptor( + data_desc, data_type, 5, dims.data(), strides.data())); + } + } + + CuDNNWrapper cudnn_wrapper_; + cudnnTensorDescriptor_t X_desc_; + cudnnTensorDescriptor_t Y_desc_; + + std::vector cached_X_dims_; +}; + +class CuDNNNHWC2NCHWOp final : public CuDNNOrderSwithOpBase { + public: + CuDNNNHWC2NCHWOp(const OperatorDef& operator_def, Workspace* ws) + : CuDNNOrderSwithOpBase(operator_def, ws) {} + + bool RunOnDevice() override { + return DispatchHelper>::call(this, Input(0)); + } + + template + bool DoRunWithType() { + const auto& X = Input(0); + auto* Y = Output(0); + const int ndim = X.ndim(); + const int N = X.dim32(0); + const int C = X.dim32(ndim - 1); + const std::vector X_dims(X.dims().cbegin(), X.dims().cend()); + std::vector Y_dims(ndim); + Y_dims[0] = N; + Y_dims[1] = C; + std::copy(X_dims.cbegin() + 1, X_dims.cend() - 1, Y_dims.begin() + 2); + Y->Resize(Y_dims); + if (cached_X_dims_ != X_dims) { + cached_X_dims_ = X_dims; + SetTensorDescriptor( + cudnnTypeWrapper::type, StorageOrder::NHWC, X_dims, X_desc_); + SetTensorDescriptor( + cudnnTypeWrapper::type, StorageOrder::NCHW, Y_dims, Y_desc_); + } + CUDNN_ENFORCE(cudnnTransformTensor( + cudnn_wrapper_.inline_cudnn_handle(), + cudnnTypeWrapper::kOne(), + X_desc_, + X.template data(), + cudnnTypeWrapper::kZero(), + Y_desc_, + Y->template mutable_data())); + return true; + } +}; + +class CuDNNNCHW2NHWCOp final : public CuDNNOrderSwithOpBase { + public: + CuDNNNCHW2NHWCOp(const OperatorDef& operator_def, Workspace* ws) + : CuDNNOrderSwithOpBase(operator_def, ws) {} + + bool RunOnDevice() override { + return DispatchHelper>::call(this, Input(0)); + } + + template + bool DoRunWithType() { + const auto& X = Input(0); + auto* Y = Output(0); + const int ndim = X.ndim(); + const int N = X.dim32(0); + const int C = X.dim32(1); + const std::vector X_dims(X.dims().cbegin(), X.dims().cend()); + std::vector Y_dims(ndim); + Y_dims[0] = N; + Y_dims[ndim - 1] = C; + std::copy(X_dims.cbegin() + 2, X_dims.cend(), Y_dims.begin() + 1); + Y->Resize(Y_dims); + if (cached_X_dims_ != X_dims) { + cached_X_dims_ = X_dims; + SetTensorDescriptor( + cudnnTypeWrapper::type, StorageOrder::NCHW, X_dims, X_desc_); + SetTensorDescriptor( + cudnnTypeWrapper::type, StorageOrder::NHWC, Y_dims, Y_desc_); + } + CUDNN_ENFORCE(cudnnTransformTensor( + cudnn_wrapper_.inline_cudnn_handle(), + cudnnTypeWrapper::kOne(), + X_desc_, + X.template data(), + cudnnTypeWrapper::kZero(), + Y_desc_, + Y->template mutable_data())); + return true; + } +}; + +} // namespace + +REGISTER_CUDNN_OPERATOR(NHWC2NCHW, CuDNNNHWC2NCHWOp); +REGISTER_CUDNN_OPERATOR(NCHW2NHWC, CuDNNNCHW2NHWCOp); + +} // namespace caffe2 diff --git a/caffe2/operators/slice_op.cc b/caffe2/operators/slice_op.cc index 529394b9092555..93c039c965f448 100644 --- a/caffe2/operators/slice_op.cc +++ b/caffe2/operators/slice_op.cc @@ -3,8 +3,8 @@ namespace caffe2 { -REGISTER_CPU_OPERATOR(Slice, SliceOp); -REGISTER_CPU_OPERATOR(SliceGradient, SliceGradientOp); +REGISTER_CPU_OPERATOR(Slice, SliceOp); +REGISTER_CPU_OPERATOR(SliceGradient, SliceGradientOp); OPERATOR_SCHEMA(Slice) .NumInputs(1, 3) diff --git a/caffe2/operators/slice_op.cu b/caffe2/operators/slice_op.cu index e2523ad7cbf3fc..5de302814ba2aa 100644 --- a/caffe2/operators/slice_op.cu +++ b/caffe2/operators/slice_op.cu @@ -231,79 +231,133 @@ bool SliceImplGpu( } // namespace -template <> -bool SliceOp::RunOnDevice() { - auto* 
output = Output(0); - auto& data = Input(0); +template<> +class SliceOp : public Operator { + public: + USE_OPERATOR_FUNCTIONS(CUDAContext); + SliceOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + starts_(this->template GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), + statically_inited_(false) {} + + bool RunOnDevice() override { + if (InputSize() > 1) { + return DispatchHelper>::call(this, Input(1)); + } else { + return DoRunWithType(); + } + } - if (InputSize() > 1) { - starts_host_.CopyFrom(Input(1)); - ends_host_.CopyFrom(Input(2)); - } else { - if (!statically_inited_) { - CAFFE_ENFORCE(HasArgument("starts")); - CAFFE_ENFORCE(HasArgument("ends")); - CAFFE_ENFORCE_EQ(starts_.size(), ends_.size()); - - starts_host_.Resize(starts_.size()); - ends_host_.Resize(ends_.size()); - - memcpy( - starts_host_.mutable_data(), - starts_.data(), - sizeof(int) * starts_.size()); - memcpy( - ends_host_.mutable_data(), - ends_.data(), - sizeof(int) * ends_.size()); - statically_inited_ = true; + template + bool DoRunWithType() { + auto* output = Output(0); + auto& data = Input(0); + + if (InputSize() > 1) { + starts_host_.CopyFrom(Input(1)); + ends_host_.CopyFrom(Input(2)); + } else { + if (!statically_inited_) { + CAFFE_ENFORCE(HasArgument("starts")); + CAFFE_ENFORCE(HasArgument("ends")); + CAFFE_ENFORCE_EQ(starts_.size(), ends_.size()); + + starts_host_.Resize(starts_.size()); + ends_host_.Resize(ends_.size()); + + memcpy( + starts_host_.mutable_data(), + starts_.data(), + sizeof(SIndex) * starts_.size()); + memcpy( + ends_host_.mutable_data(), + ends_.data(), + sizeof(SIndex) * ends_.size()); + statically_inited_ = true; + } } + + return SliceImplGpu( + output, data, starts_host_, ends_host_, &context_); } + private: + std::vector starts_; + std::vector ends_; + bool statically_inited_; + Tensor starts_host_{CPU}; + Tensor ends_host_{CPU}; - return SliceImplGpu( - output, data, starts_host_, ends_host_, &context_); -} +}; // class SliceOp -REGISTER_CUDA_OPERATOR(Slice, SliceOp); +REGISTER_CUDA_OPERATOR(Slice, SliceOp); template <> -bool SliceGradientOp::RunOnDevice() { - auto* gdata = Output(0); - auto& data = Input(0); +class SliceGradientOp : public Operator { + public: + USE_OPERATOR_FUNCTIONS(CUDAContext); + SliceGradientOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws), + starts_(this->template GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), + statically_inited_(false) {} + + AT_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); + + bool RunOnDevice() override { + if (InputSize() == 4) { + return DispatchHelper>::call(this, Input(1)); + } else { + return DoRunWithType(); + } + } - if (InputSize() == 4) { - starts_host_.CopyFrom(Input(1)); - ends_host_.CopyFrom(Input(2)); + template + bool DoRunWithType() { + auto* gdata = Output(0); + auto& data = Input(0); - auto& go = Input(3); + if (InputSize() == 4) { + starts_host_.CopyFrom(Input(1)); + ends_host_.CopyFrom(Input(2)); - return SliceImplGpu( - nullptr, data, starts_host_, ends_host_, &context_, gdata, &go); - } else { - if (!statically_inited_) { - CAFFE_ENFORCE(HasArgument("starts")); - CAFFE_ENFORCE(HasArgument("ends")); - CAFFE_ENFORCE_EQ(starts_.size(), ends_.size()); - - starts_host_.Resize(starts_.size()); - ends_host_.Resize(ends_.size()); - - memcpy( - starts_host_.mutable_data(), - starts_.data(), - sizeof(int) * starts_.size()); - memcpy( - ends_host_.mutable_data(), - ends_.data(), - sizeof(int) * 
ends_.size()); - - statically_inited_ = true; - } - auto& go = Input(1); + auto& go = Input(3); - return SliceImplGpu( - nullptr, data, starts_host_, ends_host_, &context_, gdata, &go); + return SliceImplGpu( + nullptr, data, starts_host_, ends_host_, &context_, gdata, &go); + } else { + if (!statically_inited_) { + CAFFE_ENFORCE(HasArgument("starts")); + CAFFE_ENFORCE(HasArgument("ends")); + CAFFE_ENFORCE_EQ(starts_.size(), ends_.size()); + + starts_host_.Resize(starts_.size()); + ends_host_.Resize(ends_.size()); + + memcpy( + starts_host_.mutable_data(), + starts_.data(), + sizeof(SIndex) * starts_.size()); + memcpy( + ends_host_.mutable_data(), + ends_.data(), + sizeof(SIndex) * ends_.size()); + + statically_inited_ = true; + } + auto& go = Input(1); + + return SliceImplGpu( + nullptr, data, starts_host_, ends_host_, &context_, gdata, &go); + } } -} -REGISTER_CUDA_OPERATOR(SliceGradient, SliceGradientOp); + private: + + std::vector starts_; + std::vector ends_; + bool statically_inited_; + Tensor starts_host_{CPU}; + Tensor ends_host_{CPU}; +}; // class SliceGradientOp +REGISTER_CUDA_OPERATOR(SliceGradient, SliceGradientOp); } // namespace caffe2 diff --git a/caffe2/operators/slice_op.h b/caffe2/operators/slice_op.h index 8f291affb8e8d0..aa8d4e50f0f9d9 100644 --- a/caffe2/operators/slice_op.h +++ b/caffe2/operators/slice_op.h @@ -198,22 +198,26 @@ bool SliceImpl( } // namespace -template +template class SliceOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; SliceOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), - starts_(this->template GetRepeatedArgument("starts")), - ends_(this->template GetRepeatedArgument("ends")), + starts_(this->template GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), statically_inited_(false) {} bool RunOnDevice() override { - return RunOnDeviceImpl(Input(0), Output(0)); + if (InputSize() > 1) { + return DispatchHelper>::call(this, Input(1)); + } else { + return DoRunWithType(); + } } - protected: - bool RunOnDeviceImpl(const Tensor& data, Tensor* output) { + template + bool DoRunWithType() { if (InputSize() > 1) { starts_host_.CopyFrom(Input(1)); ends_host_.CopyFrom(Input(2)); @@ -238,31 +242,45 @@ class SliceOp : public Operator { } } + auto data = Input(0); + auto output = Output(0); + return SliceImpl( output, data, starts_host_, ends_host_, &context_); } AT_DISABLE_COPY_AND_ASSIGN(SliceOp); - private: - std::vector starts_; - std::vector ends_; + protected: + std::vector starts_; + std::vector ends_; bool statically_inited_; Tensor starts_host_{CPU}; Tensor ends_host_{CPU}; }; -template +template class SliceGradientOp : public Operator { public: USE_OPERATOR_CONTEXT_FUNCTIONS; SliceGradientOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws), - starts_(this->template GetRepeatedArgument("starts")), - ends_(this->template GetRepeatedArgument("ends")), + starts_(this->template GetRepeatedArgument("starts")), + ends_(this->template GetRepeatedArgument("ends")), statically_inited_(false) {} + AT_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); + bool RunOnDevice() override { + if (InputSize() == 4) { + return DispatchHelper>::call(this, Input(1)); + } else { + return DoRunWithType(); + } + } + + template + bool DoRunWithType() { auto* gdata = Output(0); auto& data = Input(0); @@ -301,11 +319,10 @@ class SliceGradientOp : public Operator { } } - AT_DISABLE_COPY_AND_ASSIGN(SliceGradientOp); - private: - std::vector starts_; - std::vector ends_; + + std::vector 
starts_; + std::vector ends_; bool statically_inited_; Tensor starts_host_{CPU}; Tensor ends_host_{CPU}; diff --git a/caffe2/operators/string_ops_test.cc b/caffe2/operators/string_ops_test.cc index ece70ffd2425e1..c9ba13efb50258 100644 --- a/caffe2/operators/string_ops_test.cc +++ b/caffe2/operators/string_ops_test.cc @@ -26,7 +26,7 @@ class StringJoinOpTest : public testing::Test { const std::string* checkAndGetOutput(int outputSize) { const auto* output = ws_.GetBlob("Y"); EXPECT_NE(output, nullptr); - EXPECT_TRUE(output->IsType(CPU)); + EXPECT_TRUE(output->IsTensorType(CPU)); const auto& outputTensor = output->Get(); EXPECT_EQ(outputTensor.ndim(), 1); EXPECT_EQ(outputTensor.dim(0), outputSize); diff --git a/caffe2/operators/stylizer_ops.cc b/caffe2/operators/stylizer_ops.cc index 8f1e0895a28596..a6d395fe9ba647 100644 --- a/caffe2/operators/stylizer_ops.cc +++ b/caffe2/operators/stylizer_ops.cc @@ -82,7 +82,7 @@ class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp auto defaultNoiseSize = OperatorBase::GetSingleArgument( "noise_size", 491 /* prime to avoid artifacts */); - if (!noiseBlob->IsType(CPU)) { + if (!noiseBlob->IsTensorType(CPU)) { // Initialize random noise on first use. // Cache it to maintain temporal consistency. auto* t = noiseBlob->GetMutableTensor(CPU); diff --git a/caffe2/operators/utility_ops.cc b/caffe2/operators/utility_ops.cc index 6c287c37f3d6c9..b370b5ecb438cc 100644 --- a/caffe2/operators/utility_ops.cc +++ b/caffe2/operators/utility_ops.cc @@ -52,14 +52,6 @@ REGISTER_CPU_OPERATOR( ScatterWeightedSumOp); REGISTER_CPU_OPERATOR(ScatterAssign, ScatterAssignOp); -// From CPU, copy it to whatever the current context -REGISTER_CPU_OPERATOR( - CopyFromCPUInput, - CopyOp); -REGISTER_CPU_OPERATOR( - CopyOnDeviceLike, - CopyOnDeviceLikeOp); -REGISTER_CPU_OPERATOR(Copy, CopyOp); REGISTER_CPU_OPERATOR(LengthsToShape, LengthsToShapeOp); REGISTER_CPU_OPERATOR(HasElements, HasElementsOp); REGISTER_CPU_OPERATOR(GatherRanges, GatherRangesOp); @@ -379,133 +371,6 @@ Currently only works on CPU because of access to INDICES. "Update slices, with shape len(INDICES) + shape(X_0)[1:]") .Output(0, "DATA", "Has to be exactly the same tensor as the input 0"); -OPERATOR_SCHEMA(Copy) - .NumInputs(1) - .NumOutputs(1) - .IdenticalTypeAndShape() - .InputsCanCrossDevices() - .SetDoc(R"DOC( -Copy input tensor into output, potentially across devices. - -Github Links: - -- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/utility_ops.cc -- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/utility_ops.h - - -
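Returning to the Slice changes above: with the new SIndex template parameter, `starts`/`ends` supplied as input tensors may be either int32 or int64, and DispatchHelper picks the instantiation from Input(1). A minimal usage sketch through the Python frontend (assuming the same `core`/`workspace` API used by the operator-doc examples in this file; negative `ends` values count from the end of the dimension):

```
import numpy as np
from caffe2.python import core, workspace

workspace.ResetWorkspace()
workspace.FeedBlob("X", np.arange(12, dtype=np.float32).reshape(3, 4))
# int64 starts/ends now dispatch to the SIndex = int64_t instantiation.
workspace.FeedBlob("starts", np.array([0, 1], dtype=np.int64))
workspace.FeedBlob("ends", np.array([-1, 3], dtype=np.int64))
workspace.RunOperatorOnce(core.CreateOperator("Slice", ["X", "starts", "ends"], ["Y"]))
print(workspace.FetchBlob("Y"))  # columns 1..2 of every row
```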
- - Example - -**Code** - -``` - -workspace.ResetWorkspace() - -op = core.CreateOperator( - "Copy", - ["input"], - ["output"] -) - -workspace.FeedBlob("input", np.random.rand(3,3)) -print("input:", workspace.FetchBlob("input")) -workspace.RunOperatorOnce(op) -print("output:", workspace.FetchBlob("output")) - -``` - -**Result** - -``` - -input: -[[0.16826761 0.68168217 0.55196001] - [0.19735483 0.34837823 0.69015595] - [0.09448514 0.57390828 0.37097193]] -output: -[[0.16826761 0.68168217 0.55196001] - [0.19735483 0.34837823 0.69015595] - [0.09448514 0.57390828 0.37097193]] - -``` - -
- -)DOC") - .Input(0, "input", "(*Tensor*): input tensor to copy") - .Output(0, "output", "(*Tensor*): copy of input tensor"); - -OPERATOR_SCHEMA(CopyGPUToCPU) - .NumInputs(1) - .NumOutputs(1) - .IdenticalTypeAndShape() - .InputsCanCrossDevices() - .DeviceInferenceFunction([](const OperatorDef& def) { - CAFFE_ENFORCE( - def.has_device_option(), - "CopyGPUToCPU op should have cuda device option."); - auto& cuda_option = def.device_option(); - auto cpu_option = DeviceOption(); - vector in_dev(def.input_size(), cuda_option); - vector out_dev(def.output_size(), cpu_option); - return std::make_pair(in_dev, out_dev); - }) - .SetDoc(R"DOC( -Copy tensor for GPU to CPU context. Must be run under GPU device option. -)DOC") - .Input(0, "input", "The input tensor.") - .Output(0, "output", "Tensor that will contain a copy of the input."); - -OPERATOR_SCHEMA(CopyCPUToGPU) - .NumInputs(1) - .NumOutputs(1) - .IdenticalTypeAndShape() - .InputsCanCrossDevices() - .DeviceInferenceFunction([](const OperatorDef& def) { - CAFFE_ENFORCE( - def.has_device_option(), - "CopyCPUToGPU op should have cuda device option."); - auto& cuda_option = def.device_option(); - auto cpu_option = DeviceOption(); - vector in_dev(def.input_size(), cpu_option); - vector out_dev(def.output_size(), cuda_option); - return std::make_pair(in_dev, out_dev); - }) - .SetDoc(R"DOC( -Copy tensor for CPU to GPU context. Must be run under GPU device option. -)DOC") - .Input(0, "input", "The input tensor.") - .Output(0, "output", "Tensor that will contain a copy of the input."); - -OPERATOR_SCHEMA(CopyFromCPUInput) - .NumInputs(1) - .NumOutputs(1) - .IdenticalTypeAndShape() - .InputsCanCrossDevices() - .DeviceInferenceFunction([](const OperatorDef& def) { - auto op_device = - def.has_device_option() ? def.device_option() : DeviceOption(); - auto cpu_option = DeviceOption(); - vector in_dev(def.input_size(), cpu_option); - vector out_dev(def.output_size(), op_device); - return std::make_pair(in_dev, out_dev); - }) - .SetDoc(R"DOC( -Take a CPU input tensor and copy it to an output in the current -Context (GPU or CPU). This may involves cross-device MemCpy. 
-)DOC") - .Input(0, "input", "The input CPU tensor.") - .Output(0, "output", "either a TensorCUDA or a TensorCPU"); - -OPERATOR_SCHEMA(CopyOnDeviceLike) - .NumInputs(2) - .NumOutputs(1) - .SetDoc("Copy input tensor into output to the specific device.") - .Input(0, "input", "The input tensor.") - .Input(1, "dst", "Tensor, on which device the copy will be performed.") - .Output(0, "output", "Tensor that will contain a copy of the input."); OPERATOR_SCHEMA(HasElements) .NumInputs(1) @@ -937,62 +802,6 @@ struct GetFlattenToVecGradient : public GradientMakerBase { }; REGISTER_GRADIENT(FlattenToVec, GetFlattenToVecGradient); -struct GetCopyGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "CopyOnDeviceLike", - "", - vector{GO(0), I(0)}, - vector{GI(0)}); - } -}; -REGISTER_GRADIENT(Copy, GetCopyGradient); - -struct GetGPUToCPUGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - if (g_output_[0].IsDense()) { - return SingleGradientDef( - "CopyCPUToGPU", "", vector{GO(0)}, vector{GI(0)}); - } else { - return vector{CreateOperatorDef( - "CopyCPUToGPU", - "", - std::vector{GO_I(0)}, - std::vector{GI_I(0)}), - CreateOperatorDef( - "CopyCPUToGPU", - "", - std::vector{GO_V(0)}, - std::vector{GI_V(0)})}; - } - } -}; -REGISTER_GRADIENT(CopyGPUToCPU, GetGPUToCPUGradient); - -struct GetCPUToGPUGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - if (g_output_[0].IsDense()) { - return SingleGradientDef( - "CopyGPUToCPU", "", vector{GO(0)}, vector{GI(0)}); - } else { - return vector{CreateOperatorDef( - "CopyGPUToCPU", - "", - std::vector{GO_I(0)}, - std::vector{GI_I(0)}), - CreateOperatorDef( - "CopyGPUToCPU", - "", - std::vector{GO_V(0)}, - std::vector{GI_V(0)})}; - } - } -}; -REGISTER_GRADIENT(CopyCPUToGPU, GetCPUToGPUGradient); - SHOULD_NOT_DO_GRADIENT(LengthsToSegmentIds); SHOULD_NOT_DO_GRADIENT(SegmentIdsToLengths); SHOULD_NOT_DO_GRADIENT(SegmentIdsToRanges); diff --git a/caffe2/operators/utility_ops.cu b/caffe2/operators/utility_ops.cu index e771c4ee36e1cc..8272bbcad3e55c 100644 --- a/caffe2/operators/utility_ops.cu +++ b/caffe2/operators/utility_ops.cu @@ -36,28 +36,6 @@ bool SumOp::RunOnDevice() { return false; } -template <> -class CopyOnDeviceLikeOp - : public Operator { - public: - CopyOnDeviceLikeOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws) {} - USE_OPERATOR_FUNCTIONS(CUDAContext); - - bool RunOnDevice() override { - auto& input = Input(0); - auto* output = OperatorBase::Output(0, CUDA); - CUDAContext context(GetGPUIDForPointer(Input(1).raw_data())); - output->ResizeLike(input); - context.template CopyItems( - input.meta(), - input.size(), - input.raw_data(), - output->raw_mutable_data(input.meta())); - return true; - } -}; - REGISTER_CUDA_OPERATOR(Print, PrintOp); REGISTER_CUDA_OPERATOR(Flatten, FlattenOp); REGISTER_CUDA_OPERATOR(FlattenToVec, FlattenToVecOp); @@ -66,27 +44,6 @@ REGISTER_CUDA_OPERATOR(ResizeLike, ResizeLikeOp); REGISTER_CUDA_OPERATOR(Sum, SumOp); REGISTER_CUDA_OPERATOR(WeightedSum, WeightedSumOp); -// From CPU, copy it to whatever the current context -REGISTER_CUDA_OPERATOR( - CopyFromCPUInput, - CopyOp); - -// CopyGPUToCPU and CopyCPUToGPU should both be carried out in a cuda context, -// since gpu code will be involved. 
-REGISTER_CUDA_OPERATOR( - CopyGPUToCPU, - CopyOp); -REGISTER_CUDA_OPERATOR( - CopyCPUToGPU, - CopyOp); -// If we only specify Copy, we assume that it is a gpu to gpu copy - maybe -// involving different GPUs. -REGISTER_CUDA_OPERATOR(Copy, CopyOp); - -REGISTER_CUDA_OPERATOR( - CopyOnDeviceLike, - CopyOnDeviceLikeOp); - REGISTER_CUDA_OPERATOR(UnsafeCoalesce, UnsafeCoalesceOp); CAFFE_KNOWN_TYPE(const float*); diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h index c9564dfa74a86a..9a615f53e25394 100644 --- a/caffe2/operators/utility_ops.h +++ b/caffe2/operators/utility_ops.h @@ -91,8 +91,8 @@ class PrintOp final : public Operator { return true; } - if (!this->template InputIsType(0, Context::GetDeviceType()) && - !this->template InputIsType(0, CPU)) { + if (!this->InputIsTensorType(0, Context::GetDeviceType()) && + !this->InputIsTensorType(0, CPU)) { LOG(INFO) << "Blob of type: " << OperatorBase::Inputs().at(0)->meta().name(); return true; @@ -113,7 +113,7 @@ class PrintOp final : public Operator { unsigned char, std::string>; - if (this->template InputIsType(0, CPU)) { + if (this->InputIsTensorType(0, CPU)) { return DispatchHelper::call( this, this->template Input(0, CPU)); } else { @@ -129,7 +129,7 @@ class PrintOp final : public Operator { // will handle memory deallocation itself so no smart pointer is needed. const TensorCPU* tensor; Tensor tensor_copy_if_needed(CPU); - if (this->template InputIsType(0, CPU)) { + if (this->InputIsTensorType(0, CPU)) { tensor = &this->template Input(0, CPU); } else { tensor_copy_if_needed.CopyFrom(Input(0), &context_); @@ -325,7 +325,7 @@ class WeightedSumOp : public Operator { template bool DoRunWithType() { - const int input_size = InputSize(); + const int input_size = this->InputSize(); CAFFE_ENFORCE_EQ(input_size % 2, 0); const auto& X0 = Input(0); const auto& weight0 = Input(1); @@ -698,33 +698,6 @@ class ScatterAssignOp : public Operator { INPUT_TAGS(DATA, INDICES, SLICES); }; -template -class CopyOp : public Operator { - public: - USE_OPERATOR_CONTEXT_FUNCTIONS; - USE_SIMPLE_CTOR_DTOR(CopyOp); - - bool RunOnDevice() override { - auto& input = this->template Input(0, SrcContext::GetDeviceType()); - auto* output = - this->template Output(0, DstContext::GetDeviceType()); - output->ResizeLike(input); - this->context_.template CopyItems( - input.meta(), - input.size(), - input.raw_data(), - output->raw_mutable_data(input.meta())); - return true; - } -}; - -template -class CopyOnDeviceLikeOp : public CopyOp { - public: - CopyOnDeviceLikeOp(const OperatorDef& operator_def, Workspace* ws) - : CopyOp(operator_def, ws) {} -}; - template class LengthsToSegmentIdsOp : public Operator { public: diff --git a/caffe2/operators/while_op.h b/caffe2/operators/while_op.h index 258862b690e4a6..3e90341bcdd7ef 100644 --- a/caffe2/operators/while_op.h +++ b/caffe2/operators/while_op.h @@ -35,7 +35,7 @@ class WhileOp final : public Operator { bool RunOnDevice() override { CAFFE_ENFORCE( - this->template InputIsType(0, Context::GetDeviceType()), + this->InputIsTensorType(0, Context::GetDeviceType()), "Invalid condition in While operator: tensor expected"); const auto& condition = Input(0); diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index 80e2308eabf3cd..f9956060b75cfd 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -264,8 +264,7 @@ std::unique_ptr convertToNeuralNetOperator( /// \brief Ingest a caffe2 protobuf model and output an NNModule. 
/// \param net The caffe2 protobuf NetDef -/// \param blobMap [optional][output] A pointer to a blobMap to be populated with all the output blobs of the NetDef by name->NodeRef -repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map* blobMapOut) { +repr::NNModule convertToNNModule(caffe2::NetDef &net, bool strict) { repr::NNModule module; repr::NNGraph& dfg = module.dataFlow; repr::NNCFGraph& cfg = module.controlFlow; @@ -285,7 +284,6 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map> currentBasicBlock = auto bbNode = cfg.createNode(util::make_unique>()); @@ -323,17 +321,26 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map(input)); + } + } } for (const auto& outputName : net.external_output()) { @@ -345,9 +352,6 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map(instrNode); + auto* annotation = nnOp->getAnnotation(); + if (!annotation) { + auto new_annot = util::make_unique(); + new_annot->setOperatorDef(convertToOperatorDef(instrNode)); + nnOp->setAnnotation(std::move(new_annot)); + annotation = nnOp->getAnnotation(); + } + CAFFE_ENFORCE(isa(annotation)); + auto c2_annotation = dyn_cast(annotation); + return *c2_annotation; +} + caffe2::NetDef convertToCaffe2Proto(repr::NNModule &m) { auto predictNet = caffe2::NetDef(); return convertToCaffe2Proto(m, predictNet); diff --git a/caffe2/opt/converter.h b/caffe2/opt/converter.h index 31281ab90572ec..47b3b8c7a3ec3e 100644 --- a/caffe2/opt/converter.h +++ b/caffe2/opt/converter.h @@ -19,8 +19,12 @@ class Caffe2Annotation : public nom::repr::Annotation { : Annotation(AnnotationKind::Caffe2), Device(device) {} virtual ~Caffe2Annotation() {} - void setDevice(std::string device) { Device = device; } - const std::string getDevice() const { return Device; } + void setDevice(std::string device) { + Device = device; + } + const std::string getDevice() const { + return Device; + } void setDeviceType(int device) { DeviceType = device; @@ -33,6 +37,11 @@ class Caffe2Annotation : public nom::repr::Annotation { OpDef = opDef; OpDefExists = true; } + + bool hasOperatorDef() const { + return OpDefExists; + } + const caffe2::OperatorDef& getOperatorDef() const { CAFFE_ENFORCE( OpDefExists, @@ -57,7 +66,7 @@ class Caffe2Annotation : public nom::repr::Annotation { int DeviceType = caffe2::DeviceTypeProto::PROTO_CPU; }; -CAFFE2_API nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_map* blobMapOut = nullptr); +CAFFE2_API nom::repr::NNModule convertToNNModule(caffe2::NetDef &net, bool strict = false); CAFFE2_API caffe2::NetDef convertToCaffe2Proto(nom::repr::NNModule&); @@ -73,6 +82,10 @@ CAFFE2_API std::unique_ptr convertToNeuralNetOpera CAFFE2_API caffe2::OperatorDef convertToOperatorDef( const nom::repr::NNGraph::NodeRef& instrNode); +// If the annotation doesn't exist, attempt to add it +CAFFE2_API Caffe2Annotation +getOrAddCaffe2Annotation(nom::repr::NNGraph::NodeRef& instrNode); + class CAFFE2_API Converter { public: explicit Converter() = default; diff --git a/caffe2/predictor/predictor.cc b/caffe2/predictor/predictor.cc index cb1c9028d5c12b..84dac93753d37a 100644 --- a/caffe2/predictor/predictor.cc +++ b/caffe2/predictor/predictor.cc @@ -10,7 +10,7 @@ void enforceIsTensor(Workspace* ws, const std::string& name) { auto blob = ws->GetBlob(name); CAFFE_ENFORCE(blob, "Blob does not exist: ", name); CAFFE_ENFORCE( - blob->template IsType(CPU), "Blob is not a CPU Tensor: ", name); + blob->IsTensorType(CPU), "Blob is not a CPU Tensor: ", name); } 
TensorCPU* getTensor(Workspace* ws, const std::string& name) { diff --git a/caffe2/proto/torch.proto b/caffe2/proto/torch.proto new file mode 100644 index 00000000000000..9e626d8d845260 --- /dev/null +++ b/caffe2/proto/torch.proto @@ -0,0 +1,550 @@ +syntax = "proto2"; + +import "caffe2/proto/caffe2.proto"; + +package torch; + +// Overview +// +// ONNX is an open specification that is comprised of the following components: +// +// 1) A definition of an extensible computation graph model. +// 2) Definitions of standard data types. +// 3) Definitions of built-in operators. +// +// This document describes the syntax of models and their computation graphs, +// as well as the standard data types. Together, they are referred to as the ONNX +// Intermediate Representation, or 'IR' for short. +// +// The normative semantic specification of the ONNX IR is found in docs/IR.md. +// Definitions of the built-in neural network operators may be found in docs/Operators.md. + +// Notes +// +// Release +// +// We are still in the very early stage of defining ONNX. The current +// version of ONNX is a starting point. While we are actively working +// towards a complete spec, we would like to get the community involved +// by sharing our working version of ONNX. +// +// Protobuf compatibility +// +// To simplify framework compatibility, ONNX is defined using the subset of +// protobuf that is compatible with both protobuf v2 and v3. This means that we +// do not use any protobuf features that are only available in one of the two +// versions. +// +// Here are the most notable contortions we have to carry out to work around +// these limitations: +// +// - No 'map' (added protobuf 3.0). We instead represent mappings as lists +// of key-value pairs, where order does not matter and duplicates +// are not allowed. + +// Versioning +// +// ONNX versioning is specified in docs/IR.md and elaborated on in docs/Versioning.md +// +// To be compatible with both proto2 and proto3, we will use a version number +// that is not defined by the default value but an explicit enum number. +enum Version { + // proto3 requires the first enum value to be zero. + // We add this just to appease the compiler. + _START_VERSION = 0; + // The version field is always serialized and we will use it to store the + // version that the graph is generated from. This helps us set up version + // control. + // For the IR, we are using simple numbers starting with with 0x00000001, + // which was the version we published on Oct 10, 2017. + IR_VERSION_2017_10_10 = 0x0000000000000001; + + // IR_VERSION 2 published on Oct 30, 2017 + // - Added type discriminator to AttributeProto to support proto3 users + IR_VERSION_2017_10_30 = 0x0000000000000002; + + // IR VERSION 3 published on Nov 3, 2017 + // - For operator versioning: + // - Added new message OperatorSetIdProto + // - Added opset_import in ModelProto + // - For vendor extensions, added domain in NodeProto + IR_VERSION_NEWEST_ONNX = 0x0000000000000003; + + // PYTORCH IR VERSION + IR_VERSION_NEWEST = 0x0000000000000103; +} + +// Attributes +// +// A named attribute containing either singular float, integer, string, graph, +// and tensor values, or repeated float, integer, string, graph, and tensor values. +// An AttributeProto MUST contain the name field, and *only one* of the +// following content fields, effectively enforcing a C/C++ union equivalent. +message AttributeProto { + + // Note: this enum is structurally identical to the OpSchema::AttrType + // enum defined in schema.h. 
If you rev one, you likely need to rev the other. + enum AttributeType { + UNDEFINED = 0; + FLOAT = 1; + INT = 2; + STRING = 3; + TENSOR = 4; + GRAPH = 5; + + FLOATS = 6; + INTS = 7; + STRINGS = 8; + TENSORS = 9; + GRAPHS = 10; + } + + // The name field MUST be present for this version of the IR. + optional string name = 1; // namespace Attribute + + // if ref_attr_name is not empty, ref_attr_name is the attribute name in parent function. + // In this case, this AttributeProto does not contain data, and it's a reference of attribute + // in parent scope. + // NOTE: This should ONLY be used in function (sub-graph). It's invalid to be used in main graph. + optional string ref_attr_name = 21; + + // A human-readable documentation for this attribute. Markdown is allowed. + optional string doc_string = 13; + + // The type field MUST be present for this version of the IR. + // For 0.0.1 versions of the IR, this field was not defined, and + // implementations needed to use has_field hueristics to determine + // which value field was in use. For IR_VERSION 0.0.2 or later, this + // field MUST be set and match the f|i|s|t|... field in use. This + // change was made to accomodate proto3 implementations. + optional AttributeType type = 20; // discriminator that indicates which field below is in use + + // Exactly ONE of the following fields must be present for this version of the IR + optional float f = 2; // float + optional int64 i = 3; // int + optional bytes s = 4; // UTF-8 string + optional TensorProto t = 5; // tensor value + optional GraphProto g = 6; // graph + // Do not use field below, it's deprecated. + // optional ValueProto v = 12; // value - subsumes everything but graph + + repeated float floats = 7; // list of floats + repeated int64 ints = 8; // list of ints + repeated bytes strings = 9; // list of UTF-8 strings + repeated TensorProto tensors = 10; // list of tensors + repeated GraphProto graphs = 11; // list of graph +} + +// Defines information on value, including the name, the type, and +// the shape of the value. +message ValueInfoProto { + // This field MUST be present in this version of the IR. + optional string name = 1; // namespace Value + // This field MUST be present in this version of the IR. + optional TypeProto type = 2; + // A human-readable documentation for this value. Markdown is allowed. + optional string doc_string = 3; +} + +// Nodes +// +// Computation graphs are made up of a DAG of nodes, which represent what is +// commonly called a "layer" or "pipeline stage" in machine learning frameworks. +// +// For example, it can be a node of type "Conv" that takes in an image, a filter +// tensor and a bias tensor, and produces the convolved output. +message NodeProto { + repeated string input = 1; // namespace Value + repeated string output = 2; // namespace Value + + // An optional identifier for this node in a graph. + // This field MAY be absent in ths version of the IR. + optional string name = 3; // namespace Node + + // The symbolic identifier of the Operator to execute. + optional string op_type = 4; // namespace Operator + // The domain of the OperatorSet that specifies the operator named by op_type. + optional string domain = 7; // namespace Domain + + // Additional named attributes. + repeated AttributeProto attribute = 5; + + // A human-readable documentation for this node. Markdown is allowed. 
+ optional string doc_string = 6; + + // Additional annotations, attributes are defined in Schema + // To be added as annotations: + // string engine + // string list control_input + // int64 is_gradient_op + // string debug_info + repeated AttributeProto annotations = 8; + + // Besides the node type, PyTorhc also serialize ATen function signature + optional caffe2.DeviceOption device_option = 51; + optional string aten_function = 52; +} + +// Models +// +// ModelProto is a top-level file/container format for bundling a ML model and +// associating its computation graph with metadata. +// +// The semantics of the model are described by the associated GraphProto. +// +// Model ==> Caffe2 MetaNetDef +// ==> PyTorch Module +message ModelProto { + // The version of the IR this model targets. See Version enum above. + // This field MUST be present. + optional int64 ir_version = 1; + + // The OperatorSets this model relies on. + // All ModelProtos MUST have at least one entry that + // specifies which version of the ONNX OperatorSet is + // being imported. + // + // All nodes in the ModelProto's graph will bind against the operator + // with the same-domain/same-op_type operator with the HIGHEST version + // in the referenced operator sets. + repeated OperatorSetIdProto opset_import = 8; + + // The name of the framework or tool used to generate this model. + // This field SHOULD be present to indicate which implementation/tool/framework + // emitted the model. + optional string producer_name = 2; + + // The version of the framework or tool used to generate this model. + // This field SHOULD be present to indicate which implementation/tool/framework + // emitted the model. + optional string producer_version = 3; + + // Domain name of the model. + // We use reverse domain names as name space indicators. For example: + // `com.facebook.fair` or `com.microsoft.cognitiveservices` + // + // Together with `model_version` and GraphProto.name, this forms the unique identity of + // the graph. + optional string domain = 4; + + // The version of the graph encoded. See Version enum below. + optional int64 model_version = 5; + + // A human-readable documentation for this model. Markdown is allowed. + optional string doc_string = 6; + + // The parameterized graph that is evaluated to execute the model. + // The main graph, in single graph case, it is ONNX compatible. + optional GraphProto graph = 7; + + // The remaining nets in MetaNetDef. + // Submodules and methods in PyTorch. + repeated GraphProto methods = 15; + + // Named metadata values; keys should be distinct. + // Many meta data in MetaNetDef and preditor are piggy backed here. + // 1) project + // 2) model_class + // 3) internal_version + // 4) predictor_type + // 5) predictor_id + // 6) execute_plan + // 7) applicationSpecificInfo (another string map, need to verify it has no duplicate.) + // 8) engine + // 9) publish time + repeated StringStringEntryProto metadata_props = 14; + + // Model name + optional string name = 16; + + // Model name + repeated AttributeProto annotations = 17; + + // Mapping from list name to blob name list, must be string list type. + // Equivalent to blobs in MetaNetDef. + repeated AttributeProto blob_lists = 51; + + // Mapping from plan name to serialized plan, must be string list type. + // Equivalent to plans in MetaNetDef. + repeated AttributeProto plans = 52; +}; + +// StringStringEntryProto follows the pattern for cross-proto-version maps. 
+// See https://developers.google.com/protocol-buffers/docs/proto3#maps +message StringStringEntryProto { + optional string key = 1; + optional string value= 2; +}; + +// Graphs +// +// A graph defines the computational logic of a model and is comprised of a parameterized +// list of nodes that form a directed acyclic graph based on their inputs and outputs. +// This is the equivalent of the "network" or "graph" in many deep learning +// frameworks. +// Graph ==> NetDef in Caffe2 +// ==> Submodule/Method in PyTorch +message GraphProto { + // The nodes in the graph, sorted topologically. + repeated NodeProto node = 1; + + // The name of the graph. + optional string name = 2; // namespace Graph + + // A list of named tensor values, used to specify constant inputs of the graph. + // Each TensorProto entry must have a distinct name (within the list) that + // also appears in the input list. + repeated TensorProto initializer = 5; + + // A human-readable documentation for this graph. Markdown is allowed. + optional string doc_string = 10; + + // The inputs and outputs of the graph. + repeated ValueInfoProto input = 11; + repeated ValueInfoProto output = 12; + + // Information for the values in the graph. The ValueInfoProto.name's + // must be distinct. It is optional for a value to appear in value_info list. + repeated ValueInfoProto value_info = 13; + + // Additional annotations. + repeated AttributeProto annotations = 14; + + // DO NOT USE the following fields, they were deprecated from earlier versions. + // repeated string input = 3; + // repeated string output = 4; + // optional int64 ir_version = 6; + // optional int64 producer_version = 7; + // optional string producer_tag = 8; + // optional string domain = 9; +} + +// Tensors +// +// A serialized tensor value. +message TensorProto { + enum DataType { + UNDEFINED = 0; + // Basic types. + FLOAT = 1; // float + UINT8 = 2; // uint8_t + INT8 = 3; // int8_t + UINT16 = 4; // uint16_t + INT16 = 5; // int16_t + INT32 = 6; // int32_t + INT64 = 7; // int64_t + STRING = 8; // string + BOOL = 9; // bool + + // Advanced types + FLOAT16 = 10; + DOUBLE = 11; + UINT32 = 12; + UINT64 = 13; + COMPLEX64 = 14; // complex with float32 real and imaginary components + COMPLEX128 = 15; // complex with float64 real and imaginary components + // Future extensions go here. + + // Special data type, real type information is stored in ValueInfoProto. + // If data_type is SPECIAL, raw_data should be used. + SPECIAL = 51; + } + + // The shape of the tensor. + repeated int64 dims = 1; + repeated int64 strides = 14; + + // The data type of the tensor. + optional DataType data_type = 2; + + // For very large tensors, we may want to store them in chunks, in which + // case the following fields will specify the segment that is stored in + // the current TensorProto. + message Segment { + optional int64 begin = 1; + optional int64 end = 2; + optional int64 chuck_num = 51; + optional int64 chuck_id = 52; + } + // Used as offset in the external shared data. + optional Segment segment = 3; + + // Tensor content must be organized in row-major order. + // + // Depending on the data_type field, exactly one of the fields below with + // name ending in _data is used to store the elements of the tensor. + + // For float and complex64 values + // Complex64 tensors are encoded as a single array of floats, + // with the real components appearing in odd numbered positions, + // and the corresponding imaginary component apparing in the + // subsequent even numbered position. 
(e.g., [1.0 + 2.0i, 3.0 + 4.0i] + // is encoded as [1.0, 2.0 ,3.0 ,4.0] + // When this field is present, the data_type field MUST be FLOAT or COMPLEX64. + repeated float float_data = 4 [packed = true]; + + // For int32, uint8, int8, uint16, int16, bool, and float16 values + // float16 values must be bit-wise converted to an uint16_t prior + // to writing to the buffer. + // When this field is present, the data_type field MUST be + // INT32, INT16, INT8, UINT16, INT8, BOOL, or FLOAT16 + repeated int32 int32_data = 5 [packed = true]; + + // For strings. + // Each element of string_data is a UTF-8 encoded Unicode + // string. No trailing null, no leading BOM. The protobuf "string" + // scalar type is not used to match ML community conventions. + // When this field is present, the data_type field MUST be STRING + repeated bytes string_data = 6; + + // For int64. + // When this field is present, the data_type field MUST be INT64 + repeated int64 int64_data = 7 [packed = true]; + + // Optionally, a name for the tensor. + optional string name = 8; // namespace Value + + // A human-readable documentation for this tensor. Markdown is allowed. + optional string doc_string = 12; + + // Serializations can either use one of the fields above, or use this + // raw bytes field. The only exception is the string case, where one is + // required to store the content in the repeated bytes string_data field. + // + // When this raw_data field is used to store tensor value, elements MUST + // be stored in as fixed-width, little-endian order. + // Floating-point data types MUST be stored in IEEE 754 format. + // Complex64 elements must be written as two consecutive FLOAT values, real component first. + // Complex128 elements must be written as two consecutive DOUBLE values, real component first. + // Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false). + // + // Note: the advantage of specific field rather than the raw_data field is + // that in some cases (e.g. int data), protobuf does a better packing via + // variable length storage, and may lead to smaller binary footprint. + // When this field is present, the data_type field MUST NOT be STRING or UNDEFINED + optional bytes raw_data = 9; + + // For double + // Complex64 tensors are encoded as a single array of doubles, + // with the real components appearing in odd numbered positions, + // and the corresponding imaginary component apparing in the + // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] + // is encoded as [1.0, 2.0 ,3.0 ,4.0] + // When this field is present, the data_type field MUST be DOUBLE or COMPLEX128 + repeated double double_data = 10 [packed = true]; + + // For uint64 and uint32 values + // When this field is present, the data_type field MUST be + // UINT32 or UINT64 + repeated uint64 uint64_data = 11 [packed = true]; + + // External data by file name + optional string external_data = 13; + + // If two tensors represent the same weights/content, use alias. + // Must exist a TensorProto named alias in the initializer list. + // To avoid the duplicate tensor in attribute, such as value in Constant node. + // This is useful, if everything is stored just in the proto. + optional string alias = 16; + + // Additional annotations. + repeated AttributeProto annotations = 17; + + // Device info + optional caffe2.DeviceOption device_option = 51; + + // For PyTorch serialized tensor. + optional int64 require_gradient = 52; + optional int64 is_buffer = 53; +} + +// Defines a tensor shape. 
A dimension can be either an integer value +// or a symbolic variable. A symbolic variable represents an unknown +// dimension. +message TensorShapeProto { + message Dimension { + oneof value { + int64 dim_value = 1; + string dim_param = 2; // namespace Shape + }; + // Standard denotation can optionally be used to denote tensor + // dimensions with standard semantic descriptions to ensure + // that operations are applied to the correct axis of a tensor. + // Refer to https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md#denotation-definition + // for pre-defined dimension denotations. + optional string denotation = 3; + }; + // To represent a scalar, using no dim to represent 0-d tensor. + repeated Dimension dim = 1; + + repeated Dimension stride = 51; +} + +// Types +// +// The standard ONNX data types. +message TypeProto { + + message Tensor { + // This field MUST NOT have the value of UNDEFINED + // This field MUST be present for this version of the IR. + optional TensorProto.DataType elem_type = 1; + optional TensorShapeProto shape = 2; + } + + // Sequence type: List, Tuple + message Sequence { + // elem_type and elem_type_list cannot appear together. + // If all the element types are the same, we use elem_type, + // otherwise, we specify the type of each element in elem_type_list. + optional TypeProto elem_type = 1; + repeated TypeProto elem_type_list = 51; + enum SequenceType { + UNDEFINED = 0; + LIST = 1; + TUPLE = 2; + } + optional SequenceType sequence_type = 52; + } + + // Map, (not necessary at this moment) + message Map { + optional TensorProto.DataType key_type = 1; + optional TypeProto value_type = 2; + } + + // Special type of blobs, based on the type_name, we can choose the right + // serializer and deserialzier. + message SpecialBlob { + optional string type_name = 1; + } + + oneof value { + // The type of a tensor. + Tensor tensor_type = 1; + Sequence sequence_type = 4; + Map map_type = 5; + SpecialBlob special_type = 51; + } + + // An optional denotation can be used to denote the whole + // type with a standard semantic description as to what is + // stored inside. Refer to https://github.com/onnx/onnx/blob/master/docs/TypeDenotation.md#type-denotation-definition + // for pre-defined type denotations. + optional string denotation = 6; +} + +// Operator Sets +// +// OperatorSets are uniquely identified by a (domain, opset_version) pair. +message OperatorSetIdProto { + // The domain of the operator set being identified. + // The empty string ("") or absence of this field implies the operator + // set that is defined as part of the ONNX specification. + // This field MUST be present in this version of the IR when referring to any other operator set. + optional string domain = 1; + + // The version of the operator set being identified. + // This field MUST be present in this version of the IR. 
+ optional int64 version = 2; +} diff --git a/caffe2/python/compatibility.py b/caffe2/python/compatibility.py new file mode 100644 index 00000000000000..9d615a30833371 --- /dev/null +++ b/caffe2/python/compatibility.py @@ -0,0 +1,8 @@ +from six import PY2, PY3 + +if PY2: + import collections + container_abcs = collections +elif PY3: + import collections.abc + container_abcs = collections.abc diff --git a/caffe2/python/examples/resnet50_trainer.py b/caffe2/python/examples/resnet50_trainer.py index e1716857bb16b0..05b753b8fd397e 100644 --- a/caffe2/python/examples/resnet50_trainer.py +++ b/caffe2/python/examples/resnet50_trainer.py @@ -22,8 +22,10 @@ from caffe2.python.predictor_constants import predictor_constants as predictor_constants ''' -Parallelized multi-GPU distributed trainer for Resnet 50. Can be used to train -on imagenet data, for example. +Parallelized multi-GPU distributed trainer for Resne(X)t. +Can be used to train on imagenet data, for example. +The default parameters can train a standard Resnet-50 (1x64d), and parameters +can be provided to train ResNe(X)t models (e.g., ResNeXt-101 32x4d). To run the trainer in single-machine multi-gpu mode by setting num_shards = 1. @@ -39,14 +41,23 @@ ''' logging.basicConfig() -log = logging.getLogger("resnet50_trainer") +log = logging.getLogger("ResNe(X)t_trainer") log.setLevel(logging.DEBUG) dyndep.InitOpsLibrary('@/caffe2/caffe2/distributed:file_store_handler_ops') dyndep.InitOpsLibrary('@/caffe2/caffe2/distributed:redis_store_handler_ops') -def AddImageInput(model, reader, batch_size, img_size, dtype, is_test): +def AddImageInput( + model, + reader, + batch_size, + img_size, + dtype, + is_test, + mean_per_channel=None, + std_per_channel=None, +): ''' The image input operator loads image and label data from the reader and applies transformations to the images (random cropping, mirroring, ...). @@ -58,6 +69,9 @@ def AddImageInput(model, reader, batch_size, img_size, dtype, is_test): output_type=dtype, use_gpu_transform=True if model._device_type == 1 else False, use_caffe_datum=True, + mean_per_channel=mean_per_channel, + std_per_channel=std_per_channel, + # mean_per_channel takes precedence over mean mean=128., std=128., scale=256, @@ -166,6 +180,7 @@ def RunEpoch( # TODO: add loading from checkpoint log.info("Starting epoch {}/{}".format(epoch, args.num_epochs)) epoch_iters = int(args.epoch_size / total_batch_size / num_shards) + test_epoch_iters = int(args.test_epoch_size / total_batch_size / num_shards) for i in range(epoch_iters): # This timeout is required (temporarily) since CUDA-NCCL # operators might deadlock when synchronizing between GPUs. 
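The new caffe2/python/compatibility.py module above exists so the same isinstance checks work on both Python 2 (collections) and Python 3 (collections.abc); the ONNX backend and frontend switch to it further down in this diff. A minimal sketch of the intended call pattern (normalize_to_list is a made-up name for illustration):

from caffe2.python.compatibility import container_abcs

def normalize_to_list(ops):
    # Wrap a single op so callers can always iterate, without relying on
    # collections.Iterable, which newer Python 3 releases deprecate in
    # favor of collections.abc.Iterable.
    if not isinstance(ops, container_abcs.Iterable):
        ops = [ops]
    return list(ops)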
@@ -194,19 +209,25 @@ def RunEpoch( data_parallel_model.GetLearningRateBlobNames(train_model)[0] ) test_accuracy = 0 - if (test_model is not None): + test_accuracy_top5 = 0 + if test_model is not None: # Run 100 iters of testing ntests = 0 - for _ in range(0, 100): + for _ in range(test_epoch_iters): workspace.RunNet(test_model.net.Proto().name) for g in test_model._devices: test_accuracy += np.asscalar(workspace.FetchBlob( "{}_{}".format(test_model._device_prefix, g) + '/accuracy' )) + test_accuracy_top5 += np.asscalar(workspace.FetchBlob( + "{}_{}".format(test_model._device_prefix, g) + '/accuracy_top5' + )) ntests += 1 test_accuracy /= ntests + test_accuracy_top5 /= ntests else: test_accuracy = (-1) + test_accuracy_top5 = (-1) explog.log( input_count=num_images, @@ -216,7 +237,8 @@ def RunEpoch( 'loss': loss, 'learning_rate': learning_rate, 'epoch': epoch, - 'test_accuracy': test_accuracy, + 'top1_test_accuracy': test_accuracy, + 'top5_test_accuracy': test_accuracy_top5, } ) assert loss < 40, "Exploded gradients :(" @@ -243,6 +265,17 @@ def Train(args): total_batch_size % num_gpus == 0, \ "Number of GPUs must divide batch size" + # Verify valid image mean/std per channel + if args.image_mean_per_channel: + assert \ + len(args.image_mean_per_channel) == args.num_channels, \ + "The number of channels of image mean doesn't match input" + + if args.image_std_per_channel: + assert \ + len(args.image_std_per_channel) == args.num_channels, \ + "The number of channels of image std doesn't match input" + # Round down epoch size to closest multiple of batch size across machines global_batch_size = total_batch_size * args.num_shards epoch_iters = int(args.epoch_size / global_batch_size) @@ -262,7 +295,7 @@ def Train(args): 'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024), } train_model = model_helper.ModelHelper( - name="resnet50", arg_scope=train_arg_scope + name='resnext' + str(args.num_layers), arg_scope=train_arg_scope ) num_shards = args.num_shards @@ -324,7 +357,7 @@ def Train(args): rendezvous = None # Model building functions - def create_resnet50_model_ops(model, loss_scale): + def create_resnext_model_ops(model, loss_scale): initializer = (PseudoFP16Initializer if args.dtype == 'float16' else Initializer) @@ -333,11 +366,14 @@ def create_resnet50_model_ops(model, loss_scale): BiasInitializer=initializer, enable_tensor_core=args.enable_tensor_core, float16_compute=args.float16_compute): - pred = resnet.create_resnet50( + pred = resnet.create_resnext( model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, + num_layers=args.num_layers, + num_groups=args.resnext_num_groups, + num_width_per_group=args.resnext_width_per_group, no_bias=True, no_loss=True, ) @@ -348,7 +384,8 @@ def create_resnet50_model_ops(model, loss_scale): softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss']) loss = model.Scale(loss, scale=loss_scale) - brew.accuracy(model, [softmax, "label"], "accuracy") + brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1) + brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5) return [loss] def add_optimizer(model): @@ -408,6 +445,8 @@ def add_image_input(model): img_size=args.image_size, dtype=args.dtype, is_test=False, + mean_per_channel=args.image_mean_per_channel, + std_per_channel=args.image_std_per_channel, ) def add_post_sync_ops(model): @@ -423,7 +462,7 @@ def add_post_sync_ops(model): data_parallel_model.Parallelize( train_model, input_builder_fun=add_image_input, - 
forward_pass_builder_fun=create_resnet50_model_ops, + forward_pass_builder_fun=create_resnext_model_ops, optimizer_builder_fun=add_optimizer, post_sync_builder_fun=add_post_sync_ops, devices=gpus, @@ -449,7 +488,9 @@ def add_post_sync_ops(model): 'cudnn_exhaustive_search': True, } test_model = model_helper.ModelHelper( - name="resnet50_test", arg_scope=test_arg_scope, init_params=False + name='resnext' + str(args.num_layers) + "_test", + arg_scope=test_arg_scope, + init_params=False, ) test_reader = test_model.CreateDB( @@ -466,12 +507,14 @@ def test_input_fn(model): img_size=args.image_size, dtype=args.dtype, is_test=True, + mean_per_channel=args.image_mean_per_channel, + std_per_channel=args.image_std_per_channel, ) data_parallel_model.Parallelize( test_model, input_builder_fun=test_input_fn, - forward_pass_builder_fun=create_resnet50_model_ops, + forward_pass_builder_fun=create_resnext_model_ops, post_sync_builder_fun=add_post_sync_ops, param_update_builder_fun=None, devices=gpus, @@ -497,7 +540,8 @@ def test_input_fn(model): else: log.warning("The format of load_model_path doesn't match!") - expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % ( + expname = "resnext_%d_gpu%d_b%d_L%d_lr%.2f_v2" % ( + args.num_layers, args.num_gpus, total_batch_size, args.num_labels, @@ -534,12 +578,24 @@ def test_input_fn(model): def main(): # TODO: use argv parser = argparse.ArgumentParser( - description="Caffe2: Resnet-50 training" + description="Caffe2: ResNe(X)t training" ) parser.add_argument("--train_data", type=str, default=None, required=True, help="Path to training data (or 'null' to simulate)") + parser.add_argument("--num_layers", type=int, default=50, + help="The number of layers in ResNe(X)t model") + parser.add_argument("--resnext_num_groups", type=int, default=1, + help="The cardinality of resnext") + parser.add_argument("--resnext_width_per_group", type=int, default=64, + help="The cardinality of resnext") parser.add_argument("--test_data", type=str, default=None, help="Path to test data") + parser.add_argument("--image_mean_per_channel", type=float, nargs='+', + help="The per channel mean for the images") + parser.add_argument("--image_std_per_channel", type=float, nargs='+', + help="The per channel standard deviation for the images") + parser.add_argument("--test_epoch_size", type=int, default=50000, + help="Number of test images") parser.add_argument("--db_type", type=str, default="lmdb", help="Database type (such as lmdb or leveldb)") parser.add_argument("--gpus", type=str, @@ -576,7 +632,7 @@ def main(): help="Port of Redis server (for rendezvous)") parser.add_argument("--file_store_path", type=str, default="/tmp", help="Path to directory to use for rendezvous") - parser.add_argument("--save_model_name", type=str, default="resnet50_model", + parser.add_argument("--save_model_name", type=str, default="resnext_model", help="Save the trained model to a given name") parser.add_argument("--load_model_path", type=str, default=None, help="Load previously saved model to continue training") @@ -598,6 +654,7 @@ def main(): Train(args) + if __name__ == '__main__': workspace.GlobalInit(['caffe2', '--caffe2_log_level=2']) main() diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index 64174f6e71c676..5d41fdfb18262b 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -2229,7 +2229,7 @@ def ref_nhwc(x, scale, bias): in_place=st.booleans(), **hu.gcs) def test_unsafe_coalesce(self, sizes, in_place, gc, dc): - gAlignment = 32 + gAlignment = 
64 Xs = [np.random.randn(size) .astype(np.random.choice([np.float32, np.float64, np.uint8])) for size in sizes] diff --git a/caffe2/python/layers/sparse_lookup.py b/caffe2/python/layers/sparse_lookup.py index d96739b66c865b..4c3661b284dc3c 100644 --- a/caffe2/python/layers/sparse_lookup.py +++ b/caffe2/python/layers/sparse_lookup.py @@ -28,6 +28,12 @@ def get_sparse_lookup_predictor_version(version): return version +def get_sparse_lookup_trainer_version(version): + assert version in {'fp32', 'fp16'},\ + "Unexpected version of sparse_lookup layer {0}".format(version) + return version + + def _is_id_list(input_record): return schema.equal_schemas(input_record, IdList) @@ -72,10 +78,12 @@ def __init__(self, model, input_record, inner_shape, reducer, "{} should have categorical limit > 0, but got {}".format( get_key(input_record)(), input_dim)) - scale = math.sqrt(1.0 / input_dim) + self.input_dim = input_dim self.shape = [input_dim] + inner_shape - self.weight_init = weight_init if weight_init else ( - 'UniformFill', {'min': -scale, 'max': scale}) + + default_init_op = self._get_default_init_op() + + self.weight_init = weight_init or default_init_op if _is_id_list(self.input_record): sparse_key = self.input_record.items() @@ -141,6 +149,25 @@ def get_8bits_compatible_parameters(self, fused=True): ) return [RowwiseQuantized8BitsWeight(self.w, self.scale_bias)] + def _get_default_init_op(self): + scale = math.sqrt(1.0 / self.input_dim) + + cur_scope = get_current_scope() + trainer_version = get_sparse_lookup_trainer_version( + **cur_scope.get(get_sparse_lookup_trainer_version.__name__, + {'version': 'fp32'})) + + if trainer_version == 'fp32': + default_weight_init = ('UniformFill', {'min': -scale, 'max': scale}) + elif trainer_version == 'fp16': + default_weight_init = ("Float16UniformFill", {'min': -scale, 'max': scale}) + else: + raise NotImplementedError( + "Train version {} is not currently supported".format(trainer_version) + ) + + return default_weight_init + def _gather_wrapper(self, net, version, in_indices, out): # Gather can work on all kinds of input data types, and output # data with the same type. Convert the output of Gather to float, diff --git a/caffe2/python/mkl/rewrite_graph.py b/caffe2/python/mkl/rewrite_graph.py index ae8b6cd889ef38..146d5eb53cebe9 100644 --- a/caffe2/python/mkl/rewrite_graph.py +++ b/caffe2/python/mkl/rewrite_graph.py @@ -86,6 +86,131 @@ def mkl_tmp(name): fix_BoxWithNMSLimit(net) +def rewrite_run_net_simple_xrayocr_lstm(net, ideep=True): + # For xrayocr model with lstm, only rewrite the non-lstm part of the net to + # enable mkl, then copy the temporary output blob at the break point + # and all external inputs for lstm part to cpu, and execuate rest of the net + # (two lstm) on cpu + # This only works for the xrayocr lstm model which uses the first 'Shape' op + # to decide the break point, and after two lstm it's external_output + # directly so there's no need to copy back to ideep/mkl + + def mkl_tmp(name): + return "{}__MKL__".format(name) + + def cpu_tmp(name): + return "{}__CPU__".format(name) + + input_blob = net.external_input[0] + if input_blob != net.op[0].input[0]: + raise Exception( + "Input blob: {} is not consumed by first op: {}".format( + input_blob, net.op[0])) + # Modify input/outputs to point to copied MKL blobs. 
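# Aside on the sparse_lookup change earlier in this diff: the new
# _get_default_init_op picks the embedding initializer from the trainer
# version found in the current scope. A standalone sketch of that selection
# rule, with a plain argument standing in for the scope lookup (the function
# name here is illustrative, not the layer's API):
import math

def default_sparse_lookup_init(input_dim, trainer_version='fp32'):
    # Uniform in [-sqrt(1/input_dim), sqrt(1/input_dim)]; the fp16 trainer
    # version swaps in Float16UniformFill as the fill op.
    scale = math.sqrt(1.0 / input_dim)
    if trainer_version == 'fp32':
        return ('UniformFill', {'min': -scale, 'max': scale})
    elif trainer_version == 'fp16':
        return ('Float16UniformFill', {'min': -scale, 'max': scale})
    raise NotImplementedError(
        "Train version {} is not currently supported".format(trainer_version))

# e.g. a 10000-row embedding table gets bounds of +/-0.01
print(default_sparse_lookup_init(10000, 'fp16'))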
+ from_cpu = "CopyCPUToIDEEP" if ideep else "CopyCPUToMKL" + to_cpu = "CopyIDEEPToCPU" if ideep else "CopyMKLToCPU" + copy_input_op = core.CreateOperator( + from_cpu, input_blob, mkl_tmp(input_blob)) + net.op[0].input[0] = mkl_tmp(input_blob) + + # the net may contain some external_inputs falsely added during ONNX->Caffe2 + # This should be taken care of in early steps during pytorch_to_caffe2, + # but if not it can cause issue in follow up steps, so check here to confirm + for input_blob in net.external_input: + for op in net.op: + # look for if the external_input blob is output of any op in the net + assert input_blob not in op.output + + external_output = None + external_inputs_to_cpu = set() + find_first_shape_op = False + cpu_op_start_idx = -1 + for op_idx, op in enumerate(net.op): + # the first Shape op mark the starting point of LSTM chunk of the net + if not find_first_shape_op: + if op.type == 'Shape': + external_output = op.input + find_first_shape_op = True + cpu_op_start_idx = op_idx + else: + # any external input in the LSTM part need to be copied to CPU + for in_blob in op.input: + if in_blob in net.external_input: + external_inputs_to_cpu.add(in_blob) + + # make sure we found the expected break point of the net + assert external_output is not None + + # create op to copy external input blobs used in LSTM part from IDEEP to CPU + copy_extra_input_ops = [] + for in_blob in external_inputs_to_cpu: + copy_extra_input_ops.append(core.CreateOperator(to_cpu, in_blob, + cpu_tmp(in_blob))) + # rename input blobs in LSTM part to use the CPU copy + for op in net.op[cpu_op_start_idx:]: + renamed_input = [blob if blob != in_blob else cpu_tmp(in_blob) + for blob in op.input] + op.input[:] = renamed_input + + copy_output_ops = [ + core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob) + for output_blob in external_output] + + for output_blob in external_output: + last_producer_idx = last_producer(net.op, output_blob) + renamed_outputs = [blob if blob != output_blob else mkl_tmp(blob) + for blob in net.op[last_producer_idx].output] + net.op[last_producer_idx].output[:] = renamed_outputs + + # rearrange all ops in correct order + ops = [copy_input_op] + net.op[:cpu_op_start_idx] \ + + copy_output_ops + copy_extra_input_ops + net.op[cpu_op_start_idx:] + del net.op[:] + net.op.extend(ops) + + device = caffe2_pb2.IDEEP if ideep else caffe2_pb2.MKLDNN + for op in net.op: + # the first Shape op mark the starting point of LSTM chunk of the net + if op.type == 'Shape': + # all LSTM ops should run on CPU + device = caffe2_pb2.CPU + op.device_option.MergeFrom( + core.DeviceOption(device_type=device)) + op.engine = "" + + # RecurrentNetwork has a nested step_net that needs special treatment + if op.type == 'RecurrentNetwork': + for arg in op.arg: + if arg.name == 'step_net': + for nested_op in arg.n.op: + # set device to CPU + nested_op.device_option.MergeFrom( + core.DeviceOption(device_type=device)) + nested_op.engine = "" + + # rename inputs in op of nested net + renamed_input = [] + for blob in nested_op.input: + renamed_input.append(blob + if blob not in external_inputs_to_cpu + else cpu_tmp(blob)) + nested_op.input[:] = renamed_input + + # rename external inputs of nested net + new_external_input = [] + for blob in arg.n.external_input: + new_external_input.append(blob + if blob not in external_inputs_to_cpu + else cpu_tmp(blob)) + arg.n.external_input[:] = new_external_input + + if ideep: + # Temporarily disbale conv+relu fusion until we verify further + # net.ParseFromString( + # 
C.transform_optimizeForIDEEP(net.SerializeToString())) + fix_BoxWithNMSLimit(net) + + def rewrite_model_helper_simple(model, ideep=True): model = copy.deepcopy(model) # All parameter initialization should run on MKL diff --git a/caffe2/python/models/resnet.py b/caffe2/python/models/resnet.py index 60e00ed1a1ae24..7c6c6dc27fe10a 100644 --- a/caffe2/python/models/resnet.py +++ b/caffe2/python/models/resnet.py @@ -6,9 +6,12 @@ from __future__ import print_function from caffe2.python import brew +import logging + ''' -Utility for creating ResNets -See "Deep Residual Learning for Image Recognition" by He, Zhang et. al. 2015 +Utility for creating ResNe(X)t +"Deep Residual Learning for Image Recognition" by He, Zhang et. al. 2015 +"Aggregated Residual Transformations for Deep Neural Networks" by Xie et. al. 2016 ''' @@ -17,16 +20,33 @@ class ResNetBuilder(): Helper class for constructing residual blocks. ''' - def __init__(self, model, prev_blob, no_bias, is_test, spatial_bn_mom=0.9): + def __init__( + self, + model, + prev_blob, + no_bias, + is_test, + bn_epsilon=1e-5, + bn_momentum=0.9, + ): self.model = model self.comp_count = 0 self.comp_idx = 0 self.prev_blob = prev_blob self.is_test = is_test - self.spatial_bn_mom = spatial_bn_mom + self.bn_epsilon = bn_epsilon + self.bn_momentum = bn_momentum self.no_bias = 1 if no_bias else 0 - def add_conv(self, in_filters, out_filters, kernel, stride=1, pad=0): + def add_conv( + self, + in_filters, + out_filters, + kernel, + stride=1, + group=1, + pad=0, + ): self.comp_idx += 1 self.prev_blob = brew.conv( self.model, @@ -37,6 +57,7 @@ def add_conv(self, in_filters, out_filters, kernel, stride=1, pad=0): weight_init=("MSRAFill", {}), kernel=kernel, stride=stride, + group=group, pad=pad, no_bias=self.no_bias, ) @@ -56,8 +77,8 @@ def add_spatial_bn(self, num_filters): self.prev_blob, 'comp_%d_spatbn_%d' % (self.comp_count, self.comp_idx), num_filters, - epsilon=1e-3, - momentum=self.spatial_bn_mom, + epsilon=self.bn_epsilon, + momentum=self.bn_momentum, is_test=self.is_test, ) return self.prev_blob @@ -71,7 +92,8 @@ def add_bottleneck( input_filters, # num of feature maps from preceding layer base_filters, # num of filters internally in the component output_filters, # num of feature maps to output - down_sampling=False, + stride=1, + group=1, spatial_batch_norm=True, ): self.comp_idx = 0 @@ -82,7 +104,7 @@ def add_bottleneck( input_filters, base_filters, kernel=1, - stride=1 + stride=1, ) if spatial_batch_norm: @@ -95,8 +117,9 @@ def add_bottleneck( base_filters, base_filters, kernel=3, - stride=(1 if down_sampling is False else 2), - pad=1 + stride=stride, + group=group, + pad=1, ) if spatial_batch_norm: @@ -109,9 +132,10 @@ def add_bottleneck( last_conv = self.add_spatial_bn(output_filters) # Summation with input signal (shortcut) - # If we need to increase dimensions (feature maps), need to - # do a projection for the short cut - if (output_filters > input_filters): + # When the number of feature maps mismatch between the input + # and output (this usually happens when the residual stage + # changes), we need to do a projection for the short cut + if output_filters != input_filters: shortcut_blob = brew.conv( self.model, shortcut_blob, @@ -120,7 +144,7 @@ def add_bottleneck( output_filters, weight_init=("MSRAFill", {}), kernel=1, - stride=(1 if down_sampling is False else 2), + stride=stride, no_bias=self.no_bias, ) if spatial_batch_norm: @@ -129,8 +153,8 @@ def add_bottleneck( shortcut_blob, 'shortcut_projection_%d_spatbn' % self.comp_count, 
output_filters, - epsilon=1e-3, - momentum=self.spatial_bn_mom, + epsilon=self.bn_epsilon, + momentum=self.bn_momentum, is_test=self.is_test, ) @@ -144,6 +168,8 @@ def add_bottleneck( # Keep track of number of high level components if this ResNetBuilder self.comp_count += 1 + return output_filters + def add_simple_block( self, input_filters, @@ -205,28 +231,102 @@ def add_simple_block( self.comp_count += 1 +def create_resnet_32x32( + model, data, num_input_channels, num_groups, num_labels, is_test=False +): + ''' + Create residual net for smaller images (sec 4.2 of He et. al (2015)) + num_groups = 'n' in the paper + ''' + # conv1 + maxpool + brew.conv( + model, data, 'conv1', num_input_channels, 16, kernel=3, stride=1 + ) + brew.spatial_bn( + model, 'conv1', 'conv1_spatbn', 16, epsilon=1e-3, is_test=is_test + ) + brew.relu(model, 'conv1_spatbn', 'relu1') + + # Number of blocks as described in sec 4.2 + filters = [16, 32, 64] + + builder = ResNetBuilder(model, 'relu1', no_bias=0, is_test=is_test) + prev_filters = 16 + for groupidx in range(0, 3): + for blockidx in range(0, 2 * num_groups): + builder.add_simple_block( + prev_filters if blockidx == 0 else filters[groupidx], + filters[groupidx], + down_sampling=(True if blockidx == 0 and + groupidx > 0 else False)) + prev_filters = filters[groupidx] + + # Final layers + brew.average_pool( + model, builder.prev_blob, 'final_avg', kernel=8, stride=1 + ) + brew.fc(model, 'final_avg', 'last_out', 64, num_labels) + softmax = brew.softmax(model, 'last_out', 'softmax') + return softmax + + +RESNEXT_BLOCK_CONFIG = { + 18: (2, 2, 2, 2), + 34: (3, 4, 6, 3), + 50: (3, 4, 6, 3), + 101: (3, 4, 23, 3), + 152: (3, 8, 36, 3), + 200: (3, 24, 36, 3), +} + +RESNEXT_STRIDES = [1, 2, 2, 2] + +logging.basicConfig() +log = logging.getLogger("resnext_builder") +log.setLevel(logging.DEBUG) + + # The conv1 and final_avg kernel/stride args provide a basic mechanism for # adapting resnet50 for different sizes of input images. -def create_resnet50( +def create_resnext( model, data, num_input_channels, num_labels, + num_layers, + num_groups, + num_width_per_group, label=None, is_test=False, no_loss=False, - no_bias=0, + no_bias=1, conv1_kernel=7, conv1_stride=2, final_avg_kernel=7, + log=None, + bn_epsilon=1e-5, + bn_momentum=0.9, ): + if num_layers not in RESNEXT_BLOCK_CONFIG: + log.error("{}-layer is invalid for resnext config".format(num_layers)) + + num_blocks = RESNEXT_BLOCK_CONFIG[num_layers] + strides = RESNEXT_STRIDES + num_filters = [64, 256, 512, 1024, 2048] + + if num_layers in [18, 34]: + num_filters = [64, 64, 128, 256, 512] + + # the number of features before the last FC layer + num_features = num_filters[-1] + # conv1 + maxpool - brew.conv( + conv_blob = brew.conv( model, data, 'conv1', num_input_channels, - 64, + num_filters[0], weight_init=("MSRAFill", {}), kernel=conv1_kernel, stride=conv1_stride, @@ -234,41 +334,40 @@ def create_resnet50( no_bias=no_bias ) - brew.spatial_bn( + bn_blob = brew.spatial_bn( model, - 'conv1', + conv_blob, 'conv1_spatbn_relu', - 64, - epsilon=1e-3, - momentum=0.1, + num_filters[0], + epsilon=bn_epsilon, + momentum=bn_momentum, is_test=is_test ) - brew.relu(model, 'conv1_spatbn_relu', 'conv1_spatbn_relu') - brew.max_pool(model, 'conv1_spatbn_relu', 'pool1', kernel=3, stride=2) + relu_blob = brew.relu(model, bn_blob, bn_blob) + max_pool = brew.max_pool(model, relu_blob, 'pool1', kernel=3, stride=2, pad=1) # Residual blocks... 
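# Before the residual-stage loop in the next hunk, it helps to see how the
# per-stage dimensions fall out of RESNEXT_BLOCK_CONFIG, num_filters and the
# doubling bottleneck width. A sketch assuming the 32x4d configuration
# (num_groups=32, num_width_per_group=4); the trainer's defaults of 1 and 64
# reproduce a plain ResNet-50:
block_counts = (3, 4, 6, 3)                  # RESNEXT_BLOCK_CONFIG[50]
strides = [1, 2, 2, 2]                       # RESNEXT_STRIDES
num_filters = [64, 256, 512, 1024, 2048]

num_groups, width_per_group = 32, 4
inner_dim = num_groups * width_per_group     # bottleneck width, 128 for 32x4d
dim_in = num_filters[0]
for stage, blocks in enumerate(block_counts):
    dim_out = num_filters[stage + 1]
    print("stage %d: %d blocks, %d -> %d (width %d, stride %d)"
          % (stage, blocks, dim_in, dim_out, inner_dim, strides[stage]))
    dim_in = dim_out
    inner_dim *= 2
# stage 0: 3 blocks, 64 -> 256 (width 128, stride 1)
# stage 1: 4 blocks, 256 -> 512 (width 256, stride 2)
# stage 2: 6 blocks, 512 -> 1024 (width 512, stride 2)
# stage 3: 3 blocks, 1024 -> 2048 (width 1024, stride 2)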
- builder = ResNetBuilder(model, 'pool1', no_bias=no_bias, - is_test=is_test, spatial_bn_mom=0.1) - - # conv2_x (ref Table 1 in He et al. (2015)) - builder.add_bottleneck(64, 64, 256) - builder.add_bottleneck(256, 64, 256) - builder.add_bottleneck(256, 64, 256) - - # conv3_x - builder.add_bottleneck(256, 128, 512, down_sampling=True) - for _ in range(1, 4): - builder.add_bottleneck(512, 128, 512) - - # conv4_x - builder.add_bottleneck(512, 256, 1024, down_sampling=True) - for _ in range(1, 6): - builder.add_bottleneck(1024, 256, 1024) + builder = ResNetBuilder(model, max_pool, no_bias=no_bias, + is_test=is_test, bn_epsilon=1e-5, bn_momentum=0.9) + + inner_dim = num_groups * num_width_per_group + + # 4 different kinds of residual blocks + for residual_idx in range(4): + residual_num = num_blocks[residual_idx] + residual_stride = strides[residual_idx] + dim_in = num_filters[residual_idx] + + for blk_idx in range(residual_num): + dim_in = builder.add_bottleneck( + dim_in, + inner_dim, + num_filters[residual_idx + 1], # dim out + stride=residual_stride if blk_idx == 0 else 1, + group=num_groups, + ) - # conv5_x - builder.add_bottleneck(1024, 512, 2048, down_sampling=True) - builder.add_bottleneck(2048, 512, 2048) - builder.add_bottleneck(2048, 512, 2048) + inner_dim *= 2 # Final layers final_avg = brew.average_pool( @@ -282,7 +381,7 @@ def create_resnet50( # Final dimension of the "image" is reduced to 7x7 last_out = brew.fc( - model, final_avg, 'last_out_L{}'.format(num_labels), 2048, num_labels + model, final_avg, 'last_out_L{}'.format(num_labels), num_features, num_labels ) if no_loss: @@ -301,40 +400,35 @@ def create_resnet50( return brew.softmax(model, last_out, "softmax") -def create_resnet_32x32( - model, data, num_input_channels, num_groups, num_labels, is_test=False +# The conv1 and final_avg kernel/stride args provide a basic mechanism for +# adapting resnet50 for different sizes of input images. +def create_resnet50( + model, + data, + num_input_channels, + num_labels, + label=None, + is_test=False, + no_loss=False, + no_bias=0, + conv1_kernel=7, + conv1_stride=2, + final_avg_kernel=7, ): - ''' - Create residual net for smaller images (sec 4.2 of He et. 
al (2015)) - num_groups = 'n' in the paper - ''' - # conv1 + maxpool - brew.conv( - model, data, 'conv1', num_input_channels, 16, kernel=3, stride=1 - ) - brew.spatial_bn( - model, 'conv1', 'conv1_spatbn', 16, epsilon=1e-3, is_test=is_test - ) - brew.relu(model, 'conv1_spatbn', 'relu1') - - # Number of blocks as described in sec 4.2 - filters = [16, 32, 64] - - builder = ResNetBuilder(model, 'relu1', no_bias=0, is_test=is_test) - prev_filters = 16 - for groupidx in range(0, 3): - for blockidx in range(0, 2 * num_groups): - builder.add_simple_block( - prev_filters if blockidx == 0 else filters[groupidx], - filters[groupidx], - down_sampling=(True if blockidx == 0 and - groupidx > 0 else False)) - prev_filters = filters[groupidx] - - # Final layers - brew.average_pool( - model, builder.prev_blob, 'final_avg', kernel=8, stride=1 + # resnet50 is a special case for ResNeXt50-1x64d + return create_resnext( + model, + data, + num_input_channels, + num_labels, + num_layers=50, + num_groups=1, + num_width_per_group=64, + label=label, + is_test=is_test, + no_loss=no_loss, + no_bias=no_bias, + conv1_kernel=conv1_kernel, + conv1_stride=conv1_stride, + final_avg_kernel=final_avg_kernel, ) - brew.fc(model, 'final_avg', 'last_out', 64, num_labels) - softmax = brew.softmax(model, 'last_out', 'softmax') - return softmax diff --git a/caffe2/python/nomnigraph.py b/caffe2/python/nomnigraph.py index 708eae6b2a071c..abe1971680a7e9 100644 --- a/caffe2/python/nomnigraph.py +++ b/caffe2/python/nomnigraph.py @@ -68,7 +68,10 @@ def render(s): NeuralNetOperator = C.NeuralNetOperator +Operator = C.NeuralNetOperator NeuralNetData = C.NeuralNetData +Data = C.NeuralNetData NNSubgraph = C.NNSubgraph NNMatchGraph = C.NNMatchGraph Graph = C.Graph +Annotation = C.Annotation diff --git a/caffe2/python/nomnigraph_test.py b/caffe2/python/nomnigraph_test.py index 7739ac05f2979f..9288364bbcb944 100644 --- a/caffe2/python/nomnigraph_test.py +++ b/caffe2/python/nomnigraph_test.py @@ -174,3 +174,52 @@ def test_convertToProto(self): assert a == b for a, b in zip(new_netdef.external_output, net.Proto().external_output): assert a == b + + def test_node_interactions(self): + nn = ng.NNModule() + dfg = nn.dataFlow + test1 = dfg.createNode(ng.Operator("test1")) + test2 = dfg.createNode(ng.Operator("test2")) + x = dfg.createNode(ng.Data("x")) + dfg.createEdge(test1, x) + dfg.createEdge(x, test2) + p = test2.getOperatorPredecessors() + assert len(p) == 1 + assert p[0] == test1 + + # Add another node + test3 = dfg.createNode(ng.Operator("test3")) + y = dfg.createNode(ng.Data("y")) + dfg.createEdge(test3, y) + dfg.createEdge(y, test2) + p = test2.getOperatorPredecessors() + assert len(p) == 2 + assert test1 in p + assert test3 in p + + # Successors + assert len(test2.getOperatorSuccessors()) == 0 + assert len(test1.getOperatorSuccessors()) == 1 + assert test1.getOperatorSuccessors()[0] == test2 + + # Check all the nodes are valid (pybind ownership test) + for node in [test1, test2, test3]: + assert node.isOperator() + for node in [x, y]: + assert node.isTensor() + + def test_annotation_basic(self): + annot = ng.Annotation() + annot.setDevice("woot") + assert annot.getDevice() == "woot" + annot.setDeviceType(7) + assert annot.getDeviceType() == 7 + + def test_annotation_from_graph(self): + nn = ng.NNModule() + node = nn.dataFlow.createNode(ng.NeuralNetOperator("TestOp")) + annot = node.getAnnotation() + annot.setDeviceType(7) + node.setAnnotation(annot) + new_annot = node.getAnnotation() + assert new_annot.getDeviceType() == 7 diff --git 
a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 79dadb091488e6..7eacaf327ad264 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -25,6 +25,7 @@ import caffe2 from caffe2.python import core, workspace, rnn_cell, gru_cell +from caffe2.python.compatibility import container_abcs from caffe2.python.model_helper import ModelHelper from caffe2.proto import caffe2_pb2 import caffe2.python.utils @@ -140,7 +141,7 @@ class Caffe2Backend(Backend): # If you increase this, make SURE you cross-reference all BC-breaking # changes from one version to the next, and any that you did not # implement, mark as broken in _broken_operators - _known_opset_version = 7 + _known_opset_version = 9 # This dictionary will record operators which are KNOWN to be # broken, so we give a good error message rather than do something @@ -778,7 +779,7 @@ def _onnx_node_to_caffe2_op(cls, init_model, pred_model, node_def, opset_version ops = translator(init_model, pred_model, OnnxNode(node_def), opset_version) if isinstance(ops, Caffe2Ops): return ops - if not isinstance(ops, collections.Iterable): + if not isinstance(ops, container_abcs.Iterable): ops = [ops] return Caffe2Ops(ops, [], []) diff --git a/caffe2/python/onnx/frontend.py b/caffe2/python/onnx/frontend.py index 5fd470c932ac59..379ef65af904a6 100644 --- a/caffe2/python/onnx/frontend.py +++ b/caffe2/python/onnx/frontend.py @@ -12,11 +12,11 @@ from __future__ import unicode_literals import itertools -import collections import logging import re from caffe2.python import core as caffe2_core +from caffe2.python.compatibility import container_abcs from caffe2.proto import caffe2_legacy_pb2 from enum import Enum from onnx import (defs, checker, helper, numpy_helper, mapping, @@ -156,7 +156,7 @@ def caffe2_op_to_onnx_node(cls, op_def, shapes): const_tensors = [] if isinstance(nodes, tuple): nodes, const_tensors = nodes - if not isinstance(nodes, collections.Iterable): + if not isinstance(nodes, container_abcs.Iterable): nodes = [nodes] return nodes, const_tensors diff --git a/caffe2/python/operator_test/activation_ops_test.py b/caffe2/python/operator_test/activation_ops_test.py index 0d35110740825b..5be8b689f115cb 100644 --- a/caffe2/python/operator_test/activation_ops_test.py +++ b/caffe2/python/operator_test/activation_ops_test.py @@ -11,12 +11,13 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.mkl_test_util as mu +import caffe2.python.serialized_test.serialized_test_util as serial import unittest -class TestActivations(hu.HypothesisTestCase): - @given(X=hu.tensor(), in_place=st.booleans(), +class TestActivations(serial.SerializedTestCase): + @serial.given(X=hu.tensor(), in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), **mu.gcs) def test_relu(self, X, in_place, engine, gc, dc): if gc == mu.mkl_do: @@ -74,7 +75,7 @@ def relu_grad_ref(g_out, outputs, fwd_inputs): output_to_grad="X" if in_place else "Y", grad_reference=relu_grad_ref) - @given(X=hu.tensor(elements=st.floats(-3.0, 3.0)), + @serial.given(X=hu.tensor(elements=st.floats(-3.0, 3.0)), n=st.floats(min_value=0.5, max_value=2.0), in_place=st.booleans(), **hu.gcs) def test_relu_n(self, X, n, in_place, gc, dc): @@ -100,7 +101,7 @@ def relu_n_ref(X): self.assertDeviceChecks(dc, op, [X], [0]) self.assertGradientChecks(gc, op, [X], 0, [0], stepsize=0.005) - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), alpha=st.floats(min_value=0.1, max_value=2.0), in_place=st.booleans(), 
engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) @@ -169,7 +170,7 @@ def prelu_ref(X, W): # Gradient check wrt W self.assertGradientChecks(gc, op, [X, W], 1, [0], stepsize=1e-2) - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), alpha=st.floats(min_value=0.1, max_value=2.0), inplace=st.booleans(), **hu.gcs) diff --git a/caffe2/python/operator_test/adadelta_test.py b/caffe2/python/operator_test/adadelta_test.py index 9112d50f38df86..2976b06108ff51 100644 --- a/caffe2/python/operator_test/adadelta_test.py +++ b/caffe2/python/operator_test/adadelta_test.py @@ -12,9 +12,10 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestAdadelta(hu.HypothesisTestCase): +class TestAdadelta(serial.SerializedTestCase): @staticmethod def ref_adadelta(param_in, mom_in, @@ -44,7 +45,7 @@ def ref_adadelta(param_in, return (param_out.astype(np.float32), mom_out.astype(np.float32), mom_delta_out.astype(np.float32)) - @given(inputs=hu.tensors(n=4), + @serial.given(inputs=hu.tensors(n=4), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, @@ -136,7 +137,7 @@ def ref_sparse(param, moment, moment_delta, indices, grad, lr, decay, ref_using_fp16 ], ref_sparse) - @given(inputs=hu.tensors(n=3), + @serial.given(inputs=hu.tensors(n=3), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, diff --git a/caffe2/python/operator_test/adagrad_test.py b/caffe2/python/operator_test/adagrad_test.py index e4101e92cf01cb..69aead865d1c3f 100644 --- a/caffe2/python/operator_test/adagrad_test.py +++ b/caffe2/python/operator_test/adagrad_test.py @@ -15,11 +15,13 @@ from caffe2.python.operator_test.adagrad_test_helper import ( ref_adagrad, adagrad_sparse_test_helper ) +import caffe2.python.serialized_test.serialized_test_util as serial import unittest import os -class TestAdagrad(hu.HypothesisTestCase): + +class TestAdagrad(serial.SerializedTestCase): @staticmethod def ref_row_wise_adagrad(param_in, mom_in, grad, lr, epsilon): mom_out = mom_in + np.mean(np.square(grad)) @@ -27,7 +29,7 @@ def ref_row_wise_adagrad(param_in, mom_in, grad, lr, epsilon): param_out = param_in + grad_adj return (param_out, mom_out) - @given(inputs=hu.tensors(n=3), + @serial.given(inputs=hu.tensors(n=3), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, @@ -112,7 +114,7 @@ def test_sparse_adagrad(self, inputs, lr, epsilon, gc, dc): return adagrad_sparse_test_helper(self, inputs, lr, epsilon, None, ref_adagrad, gc, dc) - @given(inputs=hu.tensors(n=2), + @serial.given(inputs=hu.tensors(n=2), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, @@ -223,7 +225,7 @@ def ref_row_wise_sparse(param, momentum, indices, grad, lr): [param, momentum, indices, grad, lr], ref_row_wise_sparse) - @given(inputs=hu.tensors(n=1), + @serial.given(inputs=hu.tensors(n=1), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, diff --git a/caffe2/python/operator_test/adam_test.py b/caffe2/python/operator_test/adam_test.py index 9cf3f7c06b4385..8209b1c0493095 100644 --- a/caffe2/python/operator_test/adam_test.py +++ b/caffe2/python/operator_test/adam_test.py @@ -34,15 
+34,18 @@ def ref_adam(param, mom1, mom2, grad, LR, ITER, @staticmethod def ref_row_wise_adam(param, mom1, mom2, grad, LR, ITER, - beta1, beta2, epsilon): + beta1, beta2, epsilon, output_grad=False): t = ITER + 1 - corrected_local_rate = LR * np.sqrt(1 - np.power(beta2, t)) / \ + corrected_local_rate = np.sqrt(1 - np.power(beta2, t)) / \ (1 - np.power(beta1, t)) mom1_out = (beta1 * mom1) + (1 - beta1) * grad mom2_out = (beta2 * mom2) + (1 - beta2) * np.mean(np.square(grad)) - param_out = param + corrected_local_rate * mom1_out / \ - (np.sqrt(mom2_out) + epsilon) - return (param_out, mom1_out, mom2_out) + grad_out = corrected_local_rate * mom1_out / (np.sqrt(mom2_out) + epsilon) + param_out = param + LR * grad_out + if output_grad: + return param_out, mom1_out, mom2_out, grad_out + else: + return param_out, mom1_out, mom2_out @given(inputs=hu.tensors(n=4), ITER=st.integers(min_value=0, max_value=10000), @@ -176,6 +179,76 @@ def ref_sparse(param, mom1, mom2, indices, grad, LR, ITER): ref_sparse, input_device_options=input_device_options) + @given(inputs=hu.tensors(n=4), + ITER=st.integers(min_value=0, max_value=10000), + LR=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + beta1=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + beta2=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + epsilon=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + data_strategy=st.data(), + **hu.gcs) + def test_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, epsilon, + data_strategy, gc, dc): + param, mom1, mom2, grad = inputs + mom2 = np.absolute(mom2) + ITER = np.array([ITER], dtype=np.int64) + LR = np.array([LR], dtype=np.float32) + + # Create an indexing array containing values which index into grad + indices = data_strategy.draw( + hu.tensor( + max_dim=1, + min_value=1, + max_value=grad.shape[0], + dtype=np.int64, + elements=st.sampled_from(np.arange(grad.shape[0])), + ), + ) + + # Verify that the generated indices are unique + hypothesis.assume( + np.array_equal( + np.unique(indices.flatten()), + np.sort(indices.flatten()))) + + # Sparsify grad + grad = grad[indices] + + op = core.CreateOperator( + "SparseAdam", + ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"], + ["param", "mom1", "mom2", "output_grad"], + beta1=beta1, beta2=beta2, epsilon=epsilon) + + def ref_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER, + beta1, beta2, epsilon, output_grad): + param_out = np.copy(param) + mom1_out = np.copy(mom1) + mom2_out = np.copy(mom2) + grad_out = np.copy(grad) + + for i, index in enumerate(indices): + param_out[index], mom1_out[index], mom2_out[index], grad_out[i] = \ + self.ref_adam(param[index], mom1[index], mom2[index], + grad[i], LR, ITER, + beta1, beta2, epsilon, output_grad) + return (param_out, mom1_out, mom2_out, grad_out) + + # Iter lives on the CPU + input_device_options = {'iter': hu.cpu_do} + + self.assertReferenceChecks( + gc, op, + [param, mom1, mom2, indices, grad, LR, ITER], + functools.partial( + ref_sparse_output_grad, + beta1=beta1, beta2=beta2, epsilon=epsilon, output_grad=True), + input_device_options=input_device_options) + @given(inputs=hu.tensors(n=3), ITER=st.integers(min_value=0, max_value=10000), LR=st.floats(min_value=0.01, max_value=0.99, @@ -252,6 +325,87 @@ def ref_row_wise_sparse(param, mom1, mom2, indices, grad, LR, ITER): ref_row_wise_sparse, input_device_options=input_device_options) + 
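# Aside on the ref_row_wise_adam change above: the learning rate is now
# applied after the bias-corrected update is formed, so the effective
# gradient can be returned when output_grad=True. A NumPy sketch of that
# reference math (function name and sample arguments are illustrative):
import numpy as np

def row_wise_adam_ref(param, mom1, mom2, grad, lr, it, beta1, beta2, eps):
    t = it + 1
    corrected = np.sqrt(1 - np.power(beta2, t)) / (1 - np.power(beta1, t))
    mom1_out = beta1 * mom1 + (1 - beta1) * grad
    # row-wise variant: mom2 is a single running average of the squared
    # gradient for the whole row, not a per-element moment
    mom2_out = beta2 * mom2 + (1 - beta2) * np.mean(np.square(grad))
    grad_out = corrected * mom1_out / (np.sqrt(mom2_out) + eps)
    return param + lr * grad_out, mom1_out, mom2_out, grad_out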
@given(inputs=hu.tensors(n=3), + ITER=st.integers(min_value=0, max_value=10000), + LR=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + beta1=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + beta2=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + epsilon=st.floats(min_value=0.01, max_value=0.99, + allow_nan=False, allow_infinity=False), + data_strategy=st.data(), + **hu.gcs_cpu_only) + def test_row_wise_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, + epsilon, data_strategy, gc, dc): + param, mom1, grad = inputs + ITER = np.array([ITER], dtype=np.int64) + LR = np.array([LR], dtype=np.float32) + + # Create a 1D row-wise average 2nd moment tensor. + mom2 = data_strategy.draw( + hu.tensor1d(min_len=param.shape[0], max_len=param.shape[0], + elements=hu.elements_of_type(dtype=np.float32)) + ) + mom2 = np.absolute(mom2) + + # Create an indexing array containing values which index into grad + indices = data_strategy.draw( + hu.tensor( + max_dim=1, + min_value=1, + max_value=grad.shape[0], + dtype=np.int64, + elements=st.sampled_from(np.arange(grad.shape[0])), + ), + ) + + # Note that unlike SparseAdam, RowWiseSparseAdam uses a moment + # tensor that is strictly 1-dimensional and equal in length to the + # first dimension of the parameters, so indices must also be + # 1-dimensional. + indices = indices.flatten() + + hypothesis.note('indices.shape: %s' % str(indices.shape)) + + # Verify that the generated indices are unique + hypothesis.assume(np.array_equal(np.unique(indices), np.sort(indices))) + + # Sparsify grad + grad = grad[indices] + + op = core.CreateOperator( + "RowWiseSparseAdam", + ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"], + ["param", "mom1", "mom2", "output_grad"], + beta1=beta1, beta2=beta2, epsilon=epsilon) + + def ref_row_wise_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER, + beta1, beta2, epsilon, output_grad): + param_out = np.copy(param) + mom1_out = np.copy(mom1) + mom2_out = np.copy(mom2) + grad_out = np.copy(grad) + + for i, index in enumerate(indices): + param_out[index], mom1_out[index], mom2_out[index], grad_out[i] = \ + self.ref_row_wise_adam(param[index], mom1[index], mom2[index], + grad[i], LR, ITER, + beta1, beta2, epsilon, output_grad) + return (param_out, mom1_out, mom2_out, grad_out) + + # Iter lives on the CPU + input_device_options = {'iter': hu.cpu_do} + + self.assertReferenceChecks( + gc, op, + [param, mom1, mom2, indices, grad, LR, ITER], + functools.partial( + ref_row_wise_sparse_output_grad, + beta1=beta1, beta2=beta2, epsilon=epsilon, output_grad=True), + input_device_options=input_device_options) + if __name__ == "__main__": import unittest diff --git a/caffe2/python/operator_test/affine_channel_op_test.py b/caffe2/python/operator_test/affine_channel_op_test.py index 6e56da29b7f6a9..bcce4efc8ec529 100644 --- a/caffe2/python/operator_test/affine_channel_op_test.py +++ b/caffe2/python/operator_test/affine_channel_op_test.py @@ -2,16 +2,15 @@ from __future__ import division from __future__ import print_function -import numpy as np - from caffe2.python import core -from hypothesis import given - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import given import hypothesis.strategies as st +import numpy as np -class TestAffineChannelOp(hu.HypothesisTestCase): +class TestAffineChannelOp(serial.SerializedTestCase): def 
affine_channel_nchw_ref(self, X, scale, bias): dims = X.shape N = dims[0] @@ -30,9 +29,10 @@ def affine_channel_nhwc_ref(self, X, scale, bias): Y = X * scale + bias return [Y.reshape(dims)] - @given(N=st.integers(1, 5), C=st.integers(1, 5), H=st.integers(1, 5), - W=st.integers(1, 5), order=st.sampled_from(["NCHW", "NHWC"]), - is_learnable=st.booleans(), in_place=st.booleans(), **hu.gcs) + @serial.given(N=st.integers(1, 5), C=st.integers(1, 5), + H=st.integers(1, 5), W=st.integers(1, 5), + order=st.sampled_from(["NCHW", "NHWC"]), is_learnable=st.booleans(), + in_place=st.booleans(), **hu.gcs) def test_affine_channel_2d( self, N, C, H, W, order, is_learnable, in_place, gc, dc): op = core.CreateOperator( diff --git a/caffe2/python/operator_test/arg_ops_test.py b/caffe2/python/operator_test/arg_ops_test.py index 6492189808a72c..9bdea7ecf5cdbf 100644 --- a/caffe2/python/operator_test/arg_ops_test.py +++ b/caffe2/python/operator_test/arg_ops_test.py @@ -9,11 +9,13 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestArgOps(hu.HypothesisTestCase): - @given(X=hu.tensor(dtype=np.float32), axis=st.integers(-1, 5), - keepdims=st.booleans(), **hu.gcs) +class TestArgOps(serial.SerializedTestCase): + @serial.given( + X=hu.tensor(dtype=np.float32), axis=st.integers(-1, 5), + keepdims=st.booleans(), **hu.gcs) def test_argmax(self, X, axis, keepdims, gc, dc): if axis >= len(X.shape): axis %= len(X.shape) @@ -32,8 +34,9 @@ def argmax_ref(X): self.assertReferenceChecks(gc, op, [X], argmax_ref) self.assertDeviceChecks(dc, op, [X], [0]) - @given(X=hu.tensor(dtype=np.float32), axis=st.integers(-1, 5), - keepdims=st.booleans(), **hu.gcs) + @serial.given( + X=hu.tensor(dtype=np.float32), axis=st.integers(-1, 5), + keepdims=st.booleans(), **hu.gcs) def test_argmin(self, X, axis, keepdims, gc, dc): if axis >= len(X.shape): axis %= len(X.shape) diff --git a/caffe2/python/operator_test/batch_box_cox_test.py b/caffe2/python/operator_test/batch_box_cox_test.py index f8bc77ba9e5a7a..7252499352ee84 100644 --- a/caffe2/python/operator_test/batch_box_cox_test.py +++ b/caffe2/python/operator_test/batch_box_cox_test.py @@ -7,6 +7,7 @@ from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -51,8 +52,8 @@ def _inputs(draw): ) -class TestBatchBoxCox(hu.HypothesisTestCase): - @given( +class TestBatchBoxCox(serial.SerializedTestCase): + @serial.given( inputs=_inputs(), **hu.gcs_cpu_only ) diff --git a/caffe2/python/operator_test/batch_bucketize_op_test.py b/caffe2/python/operator_test/batch_bucketize_op_test.py index 711240de9cfdfc..301941afb590c3 100644 --- a/caffe2/python/operator_test/batch_bucketize_op_test.py +++ b/caffe2/python/operator_test/batch_bucketize_op_test.py @@ -6,13 +6,14 @@ import numpy as np from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu -class TestBatchBucketize(hu.HypothesisTestCase): - @given(**hu.gcs_cpu_only) +class TestBatchBucketize(serial.SerializedTestCase): + @serial.given(**hu.gcs_cpu_only) def test_batch_bucketize_example(self, gc, dc): op = core.CreateOperator('BatchBucketize', ["FEATURE", "INDICES", 
"BOUNDARIES", "LENGTHS"], diff --git a/caffe2/python/operator_test/batch_moments_op_test.py b/caffe2/python/operator_test/batch_moments_op_test.py index 2db25e73892563..91d49b76ee4119 100644 --- a/caffe2/python/operator_test/batch_moments_op_test.py +++ b/caffe2/python/operator_test/batch_moments_op_test.py @@ -2,16 +2,15 @@ from __future__ import division from __future__ import print_function -import numpy as np - from caffe2.python import core -from hypothesis import given - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import given import hypothesis.strategies as st +import numpy as np -class TestBatchMomentsOp(hu.HypothesisTestCase): +class TestBatchMomentsOp(serial.SerializedTestCase): def batch_moments_nchw_ref(self, X): dims = X.shape N = dims[0] @@ -29,9 +28,9 @@ def batch_moments_nhwc_ref(self, X): var = np.mean(np.square(X), axis=0) return [mu, var] - @given(N=st.integers(1, 5), C=st.integers(1, 5), H=st.integers(1, 5), - W=st.integers(1, 5), order=st.sampled_from(["NCHW", "NHWC"]), - **hu.gcs) + @serial.given(N=st.integers(1, 5), C=st.integers(1, 5), + H=st.integers(1, 5), W=st.integers(1, 5), + order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) def test_batch_moments_2d(self, N, C, H, W, order, gc, dc): op = core.CreateOperator( "BatchMoments", diff --git a/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py b/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py index a8e04fb2e14750..a47cc44e593c8b 100644 --- a/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py +++ b/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py @@ -3,17 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np - from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu +import numpy as np -class TestBatchSparseToDense(hu.HypothesisTestCase): +class TestBatchSparseToDense(serial.SerializedTestCase): - @given( + @serial.given( batch_size=st.integers(5, 10), dense_last_dim=st.integers(5, 10), default_value=st.floats(min_value=2.0, max_value=3.0), diff --git a/caffe2/python/operator_test/bbox_transform_test.py b/caffe2/python/operator_test/bbox_transform_test.py index b54a4435513be7..f76891bae4dc89 100644 --- a/caffe2/python/operator_test/bbox_transform_test.py +++ b/caffe2/python/operator_test/bbox_transform_test.py @@ -2,9 +2,11 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -201,8 +203,8 @@ def generate_rois_rotated(roi_counts, im_dims): return rotated_rois -class TestBBoxTransformOp(hu.HypothesisTestCase): - @given( +class TestBBoxTransformOp(serial.SerializedTestCase): + @serial.given( num_rois=st.integers(1, 10), num_classes=st.integers(1, 10), im_dim=st.integers(100, 600), diff --git a/caffe2/python/operator_test/boolean_mask_test.py b/caffe2/python/operator_test/boolean_mask_test.py index 638248d60bafe5..8811f5667503b8 100644 --- a/caffe2/python/operator_test/boolean_mask_test.py +++ b/caffe2/python/operator_test/boolean_mask_test.py 
@@ -2,18 +2,18 @@ from __future__ import division from __future__ import print_function -import numpy as np -from hypothesis import assume, given -import hypothesis.strategies as st - from caffe2.proto import caffe2_pb2 from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import assume, given +import hypothesis.strategies as st +import numpy as np -class TestBooleanMaskOp(hu.HypothesisTestCase): +class TestBooleanMaskOp(serial.SerializedTestCase): - @given(x=hu.tensor(min_dim=1, + @serial.given(x=hu.tensor(min_dim=1, max_dim=5, elements=st.floats(min_value=0.5, max_value=1.0)), **hu.gcs) @@ -54,7 +54,7 @@ def _dtype_conversion(x, dtype, gc, dc): x = x.astype(dtype) return x, dc - @given(x=hu.tensor(min_dim=2, + @serial.given(x=hu.tensor(min_dim=2, max_dim=5, elements=st.floats(min_value=0.5, max_value=1.0)), dtype=st.sampled_from([np.float32, np.float16]), diff --git a/caffe2/python/operator_test/boolean_unmask_test.py b/caffe2/python/operator_test/boolean_unmask_test.py index 86b2fedb49a245..e3bc9f248d3a26 100644 --- a/caffe2/python/operator_test/boolean_unmask_test.py +++ b/caffe2/python/operator_test/boolean_unmask_test.py @@ -3,16 +3,15 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np -from hypothesis import given -import hypothesis.strategies as st - from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +import hypothesis.strategies as st +import numpy as np -class TestUnmaskOp(hu.HypothesisTestCase): - @given(N=st.integers(min_value=2, max_value=20), +class TestUnmaskOp(serial.SerializedTestCase): + @serial.given(N=st.integers(min_value=2, max_value=20), dtype=st.sampled_from([ np.bool_, np.int8, diff --git a/caffe2/python/operator_test/box_with_nms_limit_op_test.py b/caffe2/python/operator_test/box_with_nms_limit_op_test.py index 8cd9acbd6a5d3c..52155c0a5d7649 100644 --- a/caffe2/python/operator_test/box_with_nms_limit_op_test.py +++ b/caffe2/python/operator_test/box_with_nms_limit_op_test.py @@ -3,13 +3,13 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np -import unittest +from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st - -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import core +import unittest +import numpy as np def get_op(input_len, output_len, args): @@ -64,8 +64,8 @@ def gen_multiple_boxes(centers, scores, count, num_classes): return ret_box, ret_scores -class TestBoxWithNMSLimitOp(hu.HypothesisTestCase): - @given(**HU_CONFIG) +class TestBoxWithNMSLimitOp(serial.SerializedTestCase): + @serial.given(**HU_CONFIG) def test_simple(self, gc): in_centers = [(0, 0), (20, 20), (50, 50)] in_scores = [0.9, 0.8, 0.6] diff --git a/caffe2/python/operator_test/ceil_op_test.py b/caffe2/python/operator_test/ceil_op_test.py index 79293fc6453232..130364261ea166 100644 --- a/caffe2/python/operator_test/ceil_op_test.py +++ b/caffe2/python/operator_test/ceil_op_test.py @@ -4,17 +4,18 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import 
hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np import unittest -class TestCeil(hu.HypothesisTestCase): +class TestCeil(serial.SerializedTestCase): - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) def test_ceil(self, X, gc, dc, engine): diff --git a/caffe2/python/operator_test/channel_backprop_stats_op_test.py b/caffe2/python/operator_test/channel_backprop_stats_op_test.py index 5c59b8d6f05c1b..e516288b436c38 100644 --- a/caffe2/python/operator_test/channel_backprop_stats_op_test.py +++ b/caffe2/python/operator_test/channel_backprop_stats_op_test.py @@ -3,17 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals +from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import assume, given import hypothesis.strategies as st import numpy as np -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -from caffe2.proto import caffe2_pb2 import unittest -class TestChannelBackpropStats(hu.HypothesisTestCase): - @given( +class TestChannelBackpropStats(serial.SerializedTestCase): + @serial.given( size=st.integers(7, 10), inputChannels=st.integers(1, 10), batchSize=st.integers(1, 3), diff --git a/caffe2/python/operator_test/channel_shuffle_test.py b/caffe2/python/operator_test/channel_shuffle_test.py index e17d7a736063c4..34417fd1847337 100644 --- a/caffe2/python/operator_test/channel_shuffle_test.py +++ b/caffe2/python/operator_test/channel_shuffle_test.py @@ -3,14 +3,15 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np -import caffe2.python.hypothesis_test_util as hu from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st +import numpy as np -class ChannelShuffleOpsTest(hu.HypothesisTestCase): +class ChannelShuffleOpsTest(serial.SerializedTestCase): def _channel_shuffle_nchw_ref(self, X, group): dims = X.shape N = dims[0] @@ -31,7 +32,7 @@ def _channel_shuffle_nhwc_ref(self, X, group): Y = np.transpose(X, axes=(0, 1, 3, 2)) return [Y.reshape(dims)] - @given(N=st.integers(1, 5), G=st.integers(1, 5), K=st.integers(1, 5), + @serial.given(N=st.integers(1, 5), G=st.integers(1, 5), K=st.integers(1, 5), H=st.integers(1, 5), W=st.integers(1, 5), order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) def test_channel_shuffle(self, N, G, K, H, W, order, gc, dc): diff --git a/caffe2/python/operator_test/channel_stats_op_test.py b/caffe2/python/operator_test/channel_stats_op_test.py index 2a238e67542f5a..f1daddee7721dd 100644 --- a/caffe2/python/operator_test/channel_stats_op_test.py +++ b/caffe2/python/operator_test/channel_stats_op_test.py @@ -3,17 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals +from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import assume, given import hypothesis.strategies as st import numpy as np -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -from caffe2.proto import caffe2_pb2 import unittest -class TestChannelStats(hu.HypothesisTestCase): - @given( +class TestChannelStats(serial.SerializedTestCase): + @serial.given( 
size=st.integers(7, 10), inputChannels=st.integers(1, 10), batchSize=st.integers(1, 3), diff --git a/caffe2/python/operator_test/clip_op_test.py b/caffe2/python/operator_test/clip_op_test.py index 38499a69eb1d90..46163d30dedc2f 100644 --- a/caffe2/python/operator_test/clip_op_test.py +++ b/caffe2/python/operator_test/clip_op_test.py @@ -10,10 +10,11 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestClip(hu.HypothesisTestCase): - @given(X=hu.tensor(), +class TestClip(serial.SerializedTestCase): + @serial.given(X=hu.tensor(), min_=st.floats(min_value=-2, max_value=0), max_=st.floats(min_value=0, max_value=2), inplace=st.booleans(), diff --git a/caffe2/python/operator_test/clip_tensor_op_test.py b/caffe2/python/operator_test/clip_tensor_op_test.py index bea2133d5182ee..042b4ef2a8326c 100644 --- a/caffe2/python/operator_test/clip_tensor_op_test.py +++ b/caffe2/python/operator_test/clip_tensor_op_test.py @@ -4,15 +4,15 @@ from __future__ import unicode_literals from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestClipTensorByScalingOp(hu.HypothesisTestCase): +class TestClipTensorByScalingOp(serial.SerializedTestCase): - @given(n=st.integers(5, 8), d=st.integers(2, 4), + @serial.given(n=st.integers(5, 8), d=st.integers(2, 4), threshold=st.floats(0.1, 10), additional_threshold=st.floats(0.1, 10), use_additional_threshold=st.booleans(), diff --git a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py index facb675a4944ec..e37738801745c2 100644 --- a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py +++ b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py @@ -4,15 +4,16 @@ from __future__ import unicode_literals import numpy as np -import unittest import os +import unittest from hypothesis import given, settings import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import core, utils from caffe2.proto import caffe2_pb2 +from caffe2.python import core, utils +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial # # Should match original Detectron code at @@ -129,17 +130,17 @@ def collect_and_distribute_fpn_rpn_ref(*inputs): return outputs -class TestCollectAndDistributeFpnRpnProposals(hu.HypothesisTestCase): +class TestCollectAndDistributeFpnRpnProposals(serial.SerializedTestCase): @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") - @given(proposal_count=st.integers(min_value=1000, max_value=8000), - rpn_min_level=st.integers(min_value=1, max_value=4), - rpn_num_levels=st.integers(min_value=1, max_value=6), - roi_min_level=st.integers(min_value=1, max_value=4), - roi_num_levels=st.integers(min_value=1, max_value=6), - rpn_post_nms_topN=st.integers(min_value=1000, max_value=4000), - roi_canonical_scale=st.integers(min_value=100, max_value=300), - roi_canonical_level=st.integers(min_value=1, max_value=8), - **hu.gcs_cpu_only) + @serial.given(proposal_count=st.integers(min_value=1000, max_value=8000), + rpn_min_level=st.integers(min_value=1, max_value=4), + rpn_num_levels=st.integers(min_value=1, 
max_value=6), + roi_min_level=st.integers(min_value=1, max_value=4), + roi_num_levels=st.integers(min_value=1, max_value=6), + rpn_post_nms_topN=st.integers(min_value=1000, max_value=4000), + roi_canonical_scale=st.integers(min_value=100, max_value=300), + roi_canonical_level=st.integers(min_value=1, max_value=8), + **hu.gcs_cpu_only) def test_collect_and_dist( self, proposal_count, diff --git a/caffe2/python/operator_test/concat_split_op_test.py b/caffe2/python/operator_test/concat_split_op_test.py index af8d5486e6e6ab..3d2c4ae31946d9 100644 --- a/caffe2/python/operator_test/concat_split_op_test.py +++ b/caffe2/python/operator_test/concat_split_op_test.py @@ -3,13 +3,14 @@ from __future__ import division from __future__ import print_function -import numpy as np -import hypothesis.strategies as st -import unittest -import caffe2.python.hypothesis_test_util as hu from caffe2.proto import caffe2_pb2 from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given +import hypothesis.strategies as st +import numpy as np +import unittest @st.composite @@ -44,8 +45,8 @@ def _tensor_splits(draw, add_axis=False): ) -class TestConcatSplitOps(hu.HypothesisTestCase): - @given(tensor_splits=_tensor_splits(), +class TestConcatSplitOps(serial.SerializedTestCase): + @serial.given(tensor_splits=_tensor_splits(), **hu.gcs) def test_concat(self, tensor_splits, gc, dc): axis, _, splits = tensor_splits @@ -92,7 +93,7 @@ def test_concat_add_axis(self, tensor_splits, gc, dc): for i in range(len(splits)): self.assertGradientChecks(gc, op, splits, i, [0]) - @given(tensor_splits=_tensor_splits(), + @serial.given(tensor_splits=_tensor_splits(), split_as_arg=st.booleans(), **hu.gcs) def test_split(self, tensor_splits, split_as_arg, gc, dc): @@ -127,7 +128,7 @@ def split_ref(input, split=split_info): self.assertDeviceChecks(dc, op, input_tensors, outputs_with_grad) self.assertGradientChecks(gc, op, input_tensors, 0, outputs_with_grad) - @given( + @serial.given( inputs=hu.lengths_tensor( dtype=np.float32, min_value=1, diff --git a/caffe2/python/operator_test/conditional_test.py b/caffe2/python/operator_test/conditional_test.py index b96b530d687751..88d8fd8b7a27a3 100644 --- a/caffe2/python/operator_test/conditional_test.py +++ b/caffe2/python/operator_test/conditional_test.py @@ -2,15 +2,15 @@ from __future__ import division from __future__ import print_function -import numpy as np -from hypothesis import given -import hypothesis.strategies as st from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +import hypothesis.strategies as st +import numpy as np -class TestConditionalOp(hu.HypothesisTestCase): - @given(rows_num=st.integers(1, 10000), **hu.gcs_cpu_only) +class TestConditionalOp(serial.SerializedTestCase): + @serial.given(rows_num=st.integers(1, 10000), **hu.gcs_cpu_only) def test_conditional(self, rows_num, gc, dc): op = core.CreateOperator( "Conditional", ["condition", "data_t", "data_f"], "output" diff --git a/caffe2/python/operator_test/conftest.py b/caffe2/python/operator_test/conftest.py new file mode 100644 index 00000000000000..54a57dfd51928d --- /dev/null +++ b/caffe2/python/operator_test/conftest.py @@ -0,0 +1,39 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import 
caffe2.python.serialized_test.serialized_test_util as serial + + +def pytest_addoption(parser): + parser.addoption( + '-G', + '--generate-serialized', + action='store_true', + dest='generate', + help='generate output files (default=false, compares to current files)', + ) + parser.addoption( + '-O', + '--output', + default=serial.DATA_DIR, + dest='output', + help='output directory (default: %(default)s)' + ) + parser.addoption( + '-D', + '--disable-serialized-check', + action='store_true', + dest='disable', + help='disable checking serialized tests' + ) + + +def pytest_configure(config): + generate = config.getoption('generate', default=False) + output = config.getoption('output', default=serial.DATA_DIR) + disable = config.getoption('disable', default=False) + serial._output_context.__setattr__('should_generate_output', generate) + serial._output_context.__setattr__('output_dir', output) + serial._output_context.__setattr__('disable_serialized_check', disable) diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index 8e65a9324535a6..d29d724b89c29d 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -13,11 +13,13 @@ from caffe2.python import brew, core, workspace import caffe2.python.hypothesis_test_util as hu from caffe2.python.model_helper import ModelHelper +import caffe2.python.serialized_test.serialized_test_util as serial import caffe2.python._import_c_extension as C import unittest import os + def _cudnn_supports( dilation=False, nhwc=False, @@ -54,7 +56,7 @@ def _cudnn_convolution_algo_count(direction): return st.sampled_from([-1]) -class TestConvolution(hu.HypothesisTestCase): +class TestConvolution(serial.SerializedTestCase): # CUDNN does NOT support different padding values and we skip it @given(op_type=st.sampled_from(["Conv", "Conv2D"]), stride_h=st.integers(1, 3), @@ -636,14 +638,15 @@ def test_use_cudnn_engine_interactions(self): self.assertEqual(model.Proto().op[-1].engine, expected_engine) - @given(op_type=st.sampled_from(["Conv", "Conv2D"]), N=st.integers(1, 4), - G=st.integers(1, 4), DX=st.integers(1, 4), DY=st.integers(1, 4), - H=st.integers(1, 4), W=st.integers(1, 4), use_bias=st.booleans(), - order=st.sampled_from(["NCHW", "NHWC"]), - force_algo_fwd=_cudnn_convolution_algo_count("fwd"), - force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"), - force_algo_wgrad=_cudnn_convolution_algo_count("wgrad"), - **hu.gcs) + @serial.given( + op_type=st.sampled_from(["Conv", "Conv2D"]), N=st.integers(1, 4), + G=st.integers(1, 4), DX=st.integers(1, 4), DY=st.integers(1, 4), + H=st.integers(1, 4), W=st.integers(1, 4), use_bias=st.booleans(), + order=st.sampled_from(["NCHW", "NHWC"]), + force_algo_fwd=_cudnn_convolution_algo_count("fwd"), + force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"), + force_algo_wgrad=_cudnn_convolution_algo_count("wgrad"), + **hu.gcs) def test_1x1_conv(self, op_type, N, G, DX, DY, H, W, use_bias, order, force_algo_fwd, force_algo_dgrad, force_algo_wgrad, gc, dc): diff --git a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py index d67df5fd3e1f32..1124df94e67ae7 100644 --- a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py +++ b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py @@ -9,10 +9,11 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial 
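
The new conftest.py above registers three pytest options that feed serial._output_context: -G/--generate-serialized to write expected outputs, -O/--output to pick the output directory (defaulting to serial.DATA_DIR), and -D/--disable-serialized-check to skip the comparison. A hedged usage sketch; the flag names come from the hunk, while the test path and output directory are illustrative:

# Shell equivalents, shown as comments:
#   pytest caffe2/python/operator_test -G                  # regenerate expected outputs
#   pytest caffe2/python/operator_test -O /tmp/serialized  # write outputs to a custom directory
#   pytest caffe2/python/operator_test -D                  # run without the serialized comparison
import pytest

# Programmatic form of the first invocation.
pytest.main(["caffe2/python/operator_test", "--generate-serialized"])
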
-class TestCosineEmbeddingCriterion(hu.HypothesisTestCase): - @given(N=st.integers(min_value=10, max_value=20), +class TestCosineEmbeddingCriterion(serial.SerializedTestCase): + @serial.given(N=st.integers(min_value=10, max_value=20), seed=st.integers(min_value=0, max_value=65535), margin=st.floats(min_value=-0.5, max_value=0.5), **hu.gcs) diff --git a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py index 51d2bbc6f484ab..4deef35c5bb506 100644 --- a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py @@ -2,10 +2,12 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core from collections import defaultdict, Counter from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -15,9 +17,9 @@ DEFAULT_PRUNE_THRESHOLD = 0.001 -class TestCTCBeamSearchDecoderOp(hu.HypothesisTestCase): +class TestCTCBeamSearchDecoderOp(serial.SerializedTestCase): - @given( + @serial.given( batch=st.sampled_from([1, 2, 4]), max_time=st.sampled_from([1, 8, 64]), alphabet_size=st.sampled_from([1, 2, 32, 128, 512]), diff --git a/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py b/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py index 0e638e8155e9e7..98079d2b026ae0 100644 --- a/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py @@ -2,18 +2,19 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np - import unittest -class TestCTCGreedyDecoderOp(hu.HypothesisTestCase): +class TestCTCGreedyDecoderOp(serial.SerializedTestCase): - @given( + @serial.given( batch=st.sampled_from([2, 4, 128, 256]), max_time=st.sampled_from([2, 10, 30, 50]), num_classes=st.sampled_from([2, 10, 26, 40]), diff --git a/caffe2/python/operator_test/distance_op_test.py b/caffe2/python/operator_test/distance_op_test.py index 0a5f9a38a4a7c6..753b94d20f1f54 100644 --- a/caffe2/python/operator_test/distance_op_test.py +++ b/caffe2/python/operator_test/distance_op_test.py @@ -3,15 +3,16 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st +import numpy as np -class DistanceTest(hu.HypothesisTestCase): - @given(n=st.integers(1, 3), +class DistanceTest(serial.SerializedTestCase): + @serial.given(n=st.integers(1, 3), dim=st.integers(4, 16), **hu.gcs) def test_cosine_similarity(self, n, dim, gc, dc): @@ -32,7 +33,7 @@ def test_cosine_similarity(self, n, dim, gc, dc): self.assertGradientChecks(gc, cos_op, [X, Y], 1, [0], stepsize=1e-2, threshold=1e-2) - @given(inputs=hu.tensors(n=2, + @serial.given(inputs=hu.tensors(n=2, min_dim=1, max_dim=2, dtype=np.float32), @@ -57,7 +58,7 @@ def dot_ref(X, Y): # Gradient check wrt Y self.assertGradientChecks(gc, 
op, [X, Y], 1, [0]) - @given(n=st.integers(1, 3), + @serial.given(n=st.integers(1, 3), dim=st.integers(4, 16), **hu.gcs) def test_L1_distance(self, n, dim, gc, dc): @@ -88,7 +89,7 @@ def test_L1_distance(self, n, dim, gc, dc): self.assertGradientChecks(gc, op, [X, Y], 1, [0], stepsize=1e-2, threshold=1e-2) - @given(n=st.integers(1, 3), + @serial.given(n=st.integers(1, 3), dim=st.integers(4, 16), **hu.gcs) def test_L2_distance(self, n, dim, gc, dc): diff --git a/caffe2/python/operator_test/dropout_op_test.py b/caffe2/python/operator_test/dropout_op_test.py index 89d63b7e13286b..2bbd9ba4efe114 100644 --- a/caffe2/python/operator_test/dropout_op_test.py +++ b/caffe2/python/operator_test/dropout_op_test.py @@ -10,11 +10,12 @@ from caffe2.proto import caffe2_pb2 from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestDropout(hu.HypothesisTestCase): +class TestDropout(serial.SerializedTestCase): - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), in_place=st.booleans(), ratio=st.floats(0, 0.999), engine=st.sampled_from(["", "CUDNN"]), diff --git a/caffe2/python/operator_test/elementwise_linear_op_test.py b/caffe2/python/operator_test/elementwise_linear_op_test.py index c67a84921a4ad0..8c7df5f33625b2 100644 --- a/caffe2/python/operator_test/elementwise_linear_op_test.py +++ b/caffe2/python/operator_test/elementwise_linear_op_test.py @@ -6,13 +6,14 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestElementwiseLinearOp(hu.HypothesisTestCase): +class TestElementwiseLinearOp(serial.SerializedTestCase): - @given(n=st.integers(2, 100), d=st.integers(2, 10), **hu.gcs) + @serial.given(n=st.integers(2, 100), d=st.integers(2, 10), **hu.gcs) # @given(n=st.integers(2, 50), d=st.integers(2, 50), **hu.gcs_cpu_only) def test(self, n, d, gc, dc): X = np.random.rand(n, d).astype(np.float32) diff --git a/caffe2/python/operator_test/elementwise_logical_ops_test.py b/caffe2/python/operator_test/elementwise_logical_ops_test.py index 7279dd5e1fb99c..8f665a06cd9e9a 100644 --- a/caffe2/python/operator_test/elementwise_logical_ops_test.py +++ b/caffe2/python/operator_test/elementwise_logical_ops_test.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu - import numpy as np import unittest @@ -21,7 +21,7 @@ def rowmux(select_vec, left, right): return mux(select, left, right) -class TestWhere(hu.HypothesisTestCase): +class TestWhere(serial.SerializedTestCase): def test_reference(self): self.assertTrue(( @@ -35,7 +35,7 @@ def test_reference(self): [[3], [4]])[0] ).all()) - @given(N=st.integers(min_value=1, max_value=10), + @serial.given(N=st.integers(min_value=1, max_value=10), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs_cpu_only) def test_where(self, N, gc, dc, engine): @@ -107,9 +107,9 @@ def test_rowwhere_dim2(self, N, gc, dc, engine): self.assertReferenceChecks(gc, op, [C, X, Y], rowmux) -class TestIsMemberOf(hu.HypothesisTestCase): +class TestIsMemberOf(serial.SerializedTestCase): - @given(N=st.integers(min_value=1, max_value=10), + 
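
The distance tests above cover row-wise CosineSimilarity, DotProduct, L1, and L2 distances; their reference implementations are not visible in these hunks, so the numpy sketch below only records the usual row-wise definitions for orientation (any scaling the L2 operator may apply is not confirmed here):

import numpy as np

X = np.random.rand(3, 16).astype(np.float32)
Y = np.random.rand(3, 16).astype(np.float32)

cosine = (X * Y).sum(axis=1) / (np.linalg.norm(X, axis=1) * np.linalg.norm(Y, axis=1))
dot = (X * Y).sum(axis=1)
l1 = np.abs(X - Y).sum(axis=1)
l2_sq = ((X - Y) ** 2).sum(axis=1)  # possibly scaled by 1/2 in the operator; not shown in this hunk
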
@serial.given(N=st.integers(min_value=1, max_value=10), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs_cpu_only) def test_is_member_of(self, N, gc, dc, engine): diff --git a/caffe2/python/operator_test/elementwise_op_broadcast_test.py b/caffe2/python/operator_test/elementwise_op_broadcast_test.py index e767a0db161a9f..161f5fc0724b14 100644 --- a/caffe2/python/operator_test/elementwise_op_broadcast_test.py +++ b/caffe2/python/operator_test/elementwise_op_broadcast_test.py @@ -11,10 +11,11 @@ from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial # TODO(jiayq): make them hypothesis tests for better coverage. -class TestElementwiseBroadcast(hu.HypothesisTestCase): +class TestElementwiseBroadcast(serial.SerializedTestCase): @given(**hu.gcs) def test_broadcast_Add(self, gc, dc): # Set broadcast and no axis, i.e. broadcasting last dimensions. @@ -168,7 +169,7 @@ def test_broadcast_Sub(self, gc, dc): self.assertDeviceChecks(dc, op, [X, Y], [0]) self.assertGradientChecks(gc, op, [X, Y], 1, [0]) - @given(**hu.gcs) + @serial.given(**hu.gcs) def test_broadcast_powt(self, gc, dc): np.random.seed(101) diff --git a/caffe2/python/operator_test/expand_op_test.py b/caffe2/python/operator_test/expand_op_test.py index efd056c8f1654d..4b608a4418dddc 100644 --- a/caffe2/python/operator_test/expand_op_test.py +++ b/caffe2/python/operator_test/expand_op_test.py @@ -7,11 +7,12 @@ from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestExpandOp(hu.HypothesisTestCase): +class TestExpandOp(serial.SerializedTestCase): def _rand_shape(self, X_shape, max_length): length = np.random.randint(max_length) shape = np.ones(length, dtype=np.int64) @@ -39,7 +40,7 @@ def ref(X, shape): self.assertDeviceChecks(dc, op, [X, shape], [0]) self.assertGradientChecks(gc, op, [X, shape], 0, [0]) - @given(X=hu.tensor(max_dim=5, dtype=np.float32), + @serial.given(X=hu.tensor(max_dim=5, dtype=np.float32), **hu.gcs) def test_expand_rand_shape(self, X, gc, dc): shape = self._rand_shape(X.shape, 5) diff --git a/caffe2/python/operator_test/fc_operator_test.py b/caffe2/python/operator_test/fc_operator_test.py index 1c444da6a8b80e..d10e19e4932ade 100644 --- a/caffe2/python/operator_test/fc_operator_test.py +++ b/caffe2/python/operator_test/fc_operator_test.py @@ -7,11 +7,12 @@ from caffe2.python import core from hypothesis import assume, given, settings, HealthCheck import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestFcOperator(hu.HypothesisTestCase): +class TestFcOperator(serial.SerializedTestCase): def _run_test(self, n, m, k, transposed, multi_dim, dtype, engine, gc, dc): if dtype == np.float16: # fp16 only supported with CUDA @@ -76,7 +77,7 @@ def fc_tranposed_op(X, W, b): threshold=threshold, stepsize=stepsize) @settings(max_examples=50, suppress_health_check=[HealthCheck.filter_too_much]) - @given(n=st.integers(1, 5), + @serial.given(n=st.integers(1, 5), m=st.integers(0, 5), k=st.integers(1, 5), multi_dim=st.sampled_from([True, False]), diff --git a/caffe2/python/operator_test/filler_ops_test.py b/caffe2/python/operator_test/filler_ops_test.py index df13cba4a3b961..df7cd1dc4e7960 100644 --- 
a/caffe2/python/operator_test/filler_ops_test.py +++ b/caffe2/python/operator_test/filler_ops_test.py @@ -3,13 +3,13 @@ from __future__ import print_function from __future__ import unicode_literals -import hypothesis.strategies as st - -from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 -from hypothesis import given +from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from hypothesis import given +import hypothesis.strategies as st import numpy as np @@ -19,7 +19,7 @@ def _fill_diagonal(shape, value): return (result,) -class TestFillerOperator(hu.HypothesisTestCase): +class TestFillerOperator(serial.SerializedTestCase): @given(**hu.gcs) def test_shape_error(self, gc, dc): @@ -127,7 +127,7 @@ def test_uniform_fill_using_arg(self, gc, dc): self.assertNotEqual(min_data, max_data) - @given( + @serial.given( shape=st.sampled_from( [ [3, 3], @@ -168,9 +168,9 @@ def test_diagonal_fill_op_int(self, gc, dc): # Check against numpy reference self.assertReferenceChecks(gc, op, [shape, value], _fill_diagonal) - @given(lengths=st.lists(st.integers(min_value=0, max_value=10), - min_size=0, - max_size=10), + @serial.given(lengths=st.lists(st.integers(min_value=0, max_value=10), + min_size=0, + max_size=10), **hu.gcs) def test_lengths_range_fill(self, lengths, gc, dc): op = core.CreateOperator( diff --git a/caffe2/python/operator_test/find_op_test.py b/caffe2/python/operator_test/find_op_test.py index febf6efeef76ec..153724a5f49d29 100644 --- a/caffe2/python/operator_test/find_op_test.py +++ b/caffe2/python/operator_test/find_op_test.py @@ -4,20 +4,18 @@ from __future__ import unicode_literals from caffe2.python import core -import hypothesis.strategies as st -from hypothesis import given - - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +import hypothesis.strategies as st import numpy as np -class TestFindOperator(hu.HypothesisTestCase): +class TestFindOperator(serial.SerializedTestCase): - @given(n=st.sampled_from([1, 4, 8, 31, 79, 150]), - idxsize=st.sampled_from([2, 4, 8, 1000, 5000]), - **hu.gcs) + @serial.given(n=st.sampled_from([1, 4, 8, 31, 79, 150]), + idxsize=st.sampled_from([2, 4, 8, 1000, 5000]), + **hu.gcs) def test_find(self, n, idxsize, gc, dc): maxval = 10 diff --git a/caffe2/python/operator_test/flexible_top_k_test.py b/caffe2/python/operator_test/flexible_top_k_test.py index 08f079b5f8cfdd..fcd20278c6caba 100644 --- a/caffe2/python/operator_test/flexible_top_k_test.py +++ b/caffe2/python/operator_test/flexible_top_k_test.py @@ -3,15 +3,16 @@ from __future__ import print_function from __future__ import unicode_literals -from collections import OrderedDict -import numpy as np - from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from collections import OrderedDict +from hypothesis import given +import numpy as np -class TestFlexibleTopK(hu.HypothesisTestCase): + +class TestFlexibleTopK(serial.SerializedTestCase): def flexible_top_k_ref(self, X, k): X_flat = X.reshape((-1, X.shape[-1])) indices_ref = np.ndarray(shape=sum(k), dtype=np.int32) @@ -38,7 +39,7 @@ def flexible_top_k_ref(self, X, k): return (values_ref, indices_ref) - @given(X=hu.tensor(min_dim=2), **hu.gcs_cpu_only) + @serial.given(X=hu.tensor(min_dim=2), **hu.gcs_cpu_only) def test_flexible_top_k(self, 
X, gc, dc): X = X.astype(dtype=np.float32) k_shape = (int(X.size / X.shape[-1]), ) diff --git a/caffe2/python/operator_test/floor_op_test.py b/caffe2/python/operator_test/floor_op_test.py index aac1e81efa6570..4cbd269620673d 100644 --- a/caffe2/python/operator_test/floor_op_test.py +++ b/caffe2/python/operator_test/floor_op_test.py @@ -4,17 +4,18 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np - import unittest -class TestFloor(hu.HypothesisTestCase): +class TestFloor(serial.SerializedTestCase): - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) def test_floor(self, X, gc, dc, engine): diff --git a/caffe2/python/operator_test/gather_ops_test.py b/caffe2/python/operator_test/gather_ops_test.py index 2c2bc33910c560..d5ab8e58cec0f2 100644 --- a/caffe2/python/operator_test/gather_ops_test.py +++ b/caffe2/python/operator_test/gather_ops_test.py @@ -7,12 +7,13 @@ from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import hypothesis.extra.numpy as hnp -class TestGatherOps(hu.HypothesisTestCase): - @given(rows_num=st.integers(1, 10000), +class TestGatherOps(serial.SerializedTestCase): + @serial.given(rows_num=st.integers(1, 10000), index_num=st.integers(0, 5000), **hu.gcs) def test_gather_ops(self, rows_num, index_num, gc, dc): @@ -52,8 +53,8 @@ def _inputs(draw): ) -class TestBatchGatherOps(hu.HypothesisTestCase): - @given(inputs=_inputs(), +class TestBatchGatherOps(serial.SerializedTestCase): + @serial.given(inputs=_inputs(), **hu.gcs) def test_batch_gather_ops(self, inputs, gc, dc): data, ind = inputs diff --git a/caffe2/python/operator_test/gather_ranges_op_test.py b/caffe2/python/operator_test/gather_ranges_op_test.py index d653dd3297bcd2..a16b92ba7d6992 100644 --- a/caffe2/python/operator_test/gather_ranges_op_test.py +++ b/caffe2/python/operator_test/gather_ranges_op_test.py @@ -6,8 +6,8 @@ from caffe2.python import core, workspace from hypothesis import given from hypothesis import strategies as st - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import numpy as np @@ -121,8 +121,9 @@ def gather_ranges_to_dense_with_key(data, ranges, key, lengths): return outputs -class TestGatherRanges(hu.HypothesisTestCase): - @given(boarders_and_data=batched_boarders_and_data(), **hu.gcs_cpu_only) +class TestGatherRanges(serial.SerializedTestCase): + @serial.given( + boarders_and_data=batched_boarders_and_data(), **hu.gcs_cpu_only) def test_gather_ranges(self, boarders_and_data, gc, dc): boarders, data = boarders_and_data @@ -142,7 +143,7 @@ def boarders_to_range(boarders): reference=gather_ranges, ) - @given(tensor_splits=_tensor_splits(), **hu.gcs_cpu_only) + @serial.given(tensor_splits=_tensor_splits(), **hu.gcs_cpu_only) def test_gather_ranges_split(self, tensor_splits, gc, dc): data, ranges, lengths, _ = tensor_splits diff --git a/caffe2/python/operator_test/glu_op_test.py b/caffe2/python/operator_test/glu_op_test.py index 98ebc9bed4012d..56902bb444efde 100644 --- a/caffe2/python/operator_test/glu_op_test.py +++ 
b/caffe2/python/operator_test/glu_op_test.py @@ -4,20 +4,21 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import assume, given, settings, HealthCheck import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np import unittest -class TestGlu(hu.HypothesisTestCase): +class TestGlu(serial.SerializedTestCase): # Suppress filter_too_much health check. # Reproduce by commenting @settings and uncommenting @seed. # @seed(302934307671667531413257853548643485645) @settings(suppress_health_check=[HealthCheck.filter_too_much]) - @given( + @serial.given( X=hu.tensor(), axis=st.integers(min_value=0, max_value=3), **hu.gcs diff --git a/caffe2/python/operator_test/group_norm_op_test.py b/caffe2/python/operator_test/group_norm_op_test.py index caa9121e924a0c..febf05136e4ded 100644 --- a/caffe2/python/operator_test/group_norm_op_test.py +++ b/caffe2/python/operator_test/group_norm_op_test.py @@ -2,16 +2,16 @@ from __future__ import division from __future__ import print_function -import numpy as np - from caffe2.python import core -from hypothesis import given - import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + +from hypothesis import given import hypothesis.strategies as st +import numpy as np -class TestGroupNormOp(hu.HypothesisTestCase): +class TestGroupNormOp(serial.SerializedTestCase): def group_norm_nchw_ref(self, X, gamma, beta, group, epsilon): dims = X.shape N = dims[0] @@ -40,10 +40,11 @@ def group_norm_nhwc_ref(self, X, gamma, beta, group, epsilon): Y = gamma * (X - mu) / std + beta return [Y.reshape(dims), mu.reshape(N, G), (1.0 / std).reshape(N, G)] - @given(N=st.integers(1, 5), G=st.integers(1, 5), D=st.integers(1, 5), - H=st.integers(2, 5), W=st.integers(2, 5), - epsilon=st.floats(min_value=1e-5, max_value=1e-4), - order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) + @serial.given( + N=st.integers(1, 5), G=st.integers(1, 5), D=st.integers(1, 5), + H=st.integers(2, 5), W=st.integers(2, 5), + epsilon=st.floats(min_value=1e-5, max_value=1e-4), + order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) def test_group_norm_2d( self, N, G, D, H, W, epsilon, order, gc, dc): op = core.CreateOperator( diff --git a/caffe2/python/operator_test/gru_test.py b/caffe2/python/operator_test/gru_test.py index 1292d843d0a827..ed8945b7927e90 100644 --- a/caffe2/python/operator_test/gru_test.py +++ b/caffe2/python/operator_test/gru_test.py @@ -7,6 +7,7 @@ from caffe2.python.model_helper import ModelHelper from caffe2.python.rnn.rnn_cell_test_util import sigmoid, tanh, _prepare_rnn import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from caffe2.proto import caffe2_pb2 from functools import partial @@ -246,11 +247,11 @@ def generate_input_state(n, d): return hidden_t, model.net -class GRUCellTest(hu.HypothesisTestCase): +class GRUCellTest(serial.SerializedTestCase): # Test just for GRUUnitOp @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") - @given( + @serial.given( seed=st.integers(0, 2**32 - 1), input_tensor=gru_unit_op_input(), fwd_only=st.booleans(), diff --git a/caffe2/python/operator_test/hyperbolic_ops_test.py b/caffe2/python/operator_test/hyperbolic_ops_test.py index dbb308f680021b..252855b39e4f25 100644 --- 
a/caffe2/python/operator_test/hyperbolic_ops_test.py +++ b/caffe2/python/operator_test/hyperbolic_ops_test.py @@ -6,11 +6,12 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestHyperbolicOps(hu.HypothesisTestCase): +class TestHyperbolicOps(serial.SerializedTestCase): def _test_hyperbolic_op(self, op_name, np_ref, X, in_place, engine, gc, dc): op = core.CreateOperator( op_name, @@ -30,15 +31,15 @@ def ref(X): self.assertDeviceChecks(dc, op, [X], [0]) self.assertGradientChecks(gc, op, [X], 0, [0]) - @given(X=hu.tensor(dtype=np.float32), **hu.gcs) + @serial.given(X=hu.tensor(dtype=np.float32), **hu.gcs) def test_sinh(self, X, gc, dc): self._test_hyperbolic_op("Sinh", np.sinh, X, False, "", gc, dc) - @given(X=hu.tensor(dtype=np.float32), **hu.gcs) + @serial.given(X=hu.tensor(dtype=np.float32), **hu.gcs) def test_cosh(self, X, gc, dc): self._test_hyperbolic_op("Cosh", np.cosh, X, False, "", gc, dc) - @given(X=hu.tensor(dtype=np.float32), in_place=st.booleans(), + @serial.given(X=hu.tensor(dtype=np.float32), in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) def test_tanh(self, X, in_place, engine, gc, dc): self._test_hyperbolic_op("Tanh", np.tanh, X, in_place, engine, gc, dc) diff --git a/caffe2/python/operator_test/index_hash_ops_test.py b/caffe2/python/operator_test/index_hash_ops_test.py index 1f8b7344b74b9a..6a3678abdbe3fb 100644 --- a/caffe2/python/operator_test/index_hash_ops_test.py +++ b/caffe2/python/operator_test/index_hash_ops_test.py @@ -2,15 +2,17 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestIndexHashOps(hu.HypothesisTestCase): - @given( +class TestIndexHashOps(serial.SerializedTestCase): + @serial.given( indices=st.sampled_from([ np.int32, np.int64 ]).flatmap(lambda dtype: hu.tensor(min_dim=1, max_dim=1, dtype=dtype)), diff --git a/caffe2/python/operator_test/instance_norm_test.py b/caffe2/python/operator_test/instance_norm_test.py index a91154a4e45f6d..1d072a230ae3a7 100644 --- a/caffe2/python/operator_test/instance_norm_test.py +++ b/caffe2/python/operator_test/instance_norm_test.py @@ -8,11 +8,13 @@ from caffe2.python import core, model_helper, brew import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import unittest import os -class TestInstanceNorm(hu.HypothesisTestCase): + +class TestInstanceNorm(serial.SerializedTestCase): def _get_inputs(self, N, C, H, W, order): if order == 'NCHW': @@ -131,7 +133,7 @@ def test_instance_norm_layout(self, gc, dc, N, C, H, W, store_mean, atol=1e-4, rtol=1e-4) - @given(gc=hu.gcs['gc'], + @serial.given(gc=hu.gcs['gc'], dc=hu.gcs['dc'], N=st.integers(2, 10), C=st.integers(3, 10), diff --git a/caffe2/python/operator_test/integral_image_ops_test.py b/caffe2/python/operator_test/integral_image_ops_test.py index 011720d109d6d2..6f9e5d90572ab1 100644 --- a/caffe2/python/operator_test/integral_image_ops_test.py +++ b/caffe2/python/operator_test/integral_image_ops_test.py @@ -2,15 +2,16 @@ from __future__ import division from __future__ import print_function 
from __future__ import unicode_literals -from hypothesis import given + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestIntegralImageOps(hu.HypothesisTestCase): - @given(batch_size=st.integers(1, 3), +class TestIntegralImageOps(serial.SerializedTestCase): + @serial.given(batch_size=st.integers(1, 3), height=st.integers(7, 10), width=st.integers(7, 10), channels=st.integers(1, 8), @@ -45,7 +46,7 @@ def integral_image(im): self.assertDeviceChecks(dc, op, [im], [0]) self.assertReferenceChecks(gc, op, [im], integral_image) - @given(batch_size=st.integers(1, 3), + @serial.given(batch_size=st.integers(1, 3), height=st.integers(7, 10), width=st.integers(7, 10), channels=st.integers(1, 8), diff --git a/caffe2/python/operator_test/jsd_ops_test.py b/caffe2/python/operator_test/jsd_ops_test.py index 97a1fbaeec2222..51faa14b9029fe 100644 --- a/caffe2/python/operator_test/jsd_ops_test.py +++ b/caffe2/python/operator_test/jsd_ops_test.py @@ -2,9 +2,11 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -24,8 +26,8 @@ def jsd_grad(go, o, pq_list): return [np.log(p * (1 - m) / (1 - p) / m) / 2. * go, None] -class TestJSDOps(hu.HypothesisTestCase): - @given(n=st.integers(10, 100), **hu.gcs_cpu_only) +class TestJSDOps(serial.SerializedTestCase): + @serial.given(n=st.integers(10, 100), **hu.gcs_cpu_only) def test_bernoulli_jsd(self, n, gc, dc): p = np.random.rand(n).astype(np.float32) q = np.random.rand(n).astype(np.float32) diff --git a/caffe2/python/operator_test/layer_norm_op_test.py b/caffe2/python/operator_test/layer_norm_op_test.py index fa0958afd1f99a..59203fd960c88b 100644 --- a/caffe2/python/operator_test/layer_norm_op_test.py +++ b/caffe2/python/operator_test/layer_norm_op_test.py @@ -4,17 +4,18 @@ from __future__ import unicode_literals from caffe2.python import brew, core +from caffe2.python.model_helper import ModelHelper from hypothesis import given import caffe2.python.hypothesis_test_util as hu -import unittest -import os +import caffe2.python.serialized_test.serialized_test_util as serial import numpy as np +import os +import unittest -from caffe2.python.model_helper import ModelHelper -class TestLayerNormOp(hu.HypothesisTestCase): +class TestLayerNormOp(serial.SerializedTestCase): @unittest.skipIf("IN_CIRCLECI" in os.environ, "FIXME: flaky test in CircleCI") - @given(X=hu.tensors(n=1), **hu.gcs) + @serial.given(X=hu.tensors(n=1), **hu.gcs) def test_layer_norm_grad_op(self, X, gc, dc): X = X[0] if len(X.shape) == 1: diff --git a/caffe2/python/operator_test/learning_rate_adaption_op_test.py b/caffe2/python/operator_test/learning_rate_adaption_op_test.py index 2284fdbba0785c..84e1307568f22d 100644 --- a/caffe2/python/operator_test/learning_rate_adaption_op_test.py +++ b/caffe2/python/operator_test/learning_rate_adaption_op_test.py @@ -3,16 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals -from hypothesis import given -import hypothesis.strategies as st from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +from 
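
The jsd_grad reference above is the derivative of the Jensen-Shannon divergence between element-wise Bernoulli(p) and Bernoulli(q) with mixture m = (p + q) / 2; the forward reference is not shown in this hunk, so the sketch below is a hedged standalone version of that standard formula:

import numpy as np

def bernoulli_kl(p, q):
    return p * np.log(p / q) + (1 - p) * np.log((1 - p) / (1 - q))

def bernoulli_jsd(p, q):
    m = (p + q) / 2.0
    return (bernoulli_kl(p, m) + bernoulli_kl(q, m)) / 2.0

# Keep probabilities strictly inside (0, 1) to avoid log(0).
p = np.random.uniform(0.01, 0.99, 10).astype(np.float32)
q = np.random.uniform(0.01, 0.99, 10).astype(np.float32)
assert np.all(bernoulli_jsd(p, q) >= 0)  # JSD is non-negative
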
hypothesis import given +import hypothesis.strategies as st import numpy as np -class TestLearningRateAdaption(hu.HypothesisTestCase): - @given(inputs=hu.tensors(n=2), +class TestLearningRateAdaption(serial.SerializedTestCase): + @serial.given(inputs=hu.tensors(n=2), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), lr_alpha=st.floats(min_value=0.01, max_value=0.99, diff --git a/caffe2/python/operator_test/learning_rate_op_test.py b/caffe2/python/operator_test/learning_rate_op_test.py index 73710e520f9495..3677239817d7e7 100644 --- a/caffe2/python/operator_test/learning_rate_op_test.py +++ b/caffe2/python/operator_test/learning_rate_op_test.py @@ -5,6 +5,7 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st @@ -15,8 +16,8 @@ import numpy as np -class TestLearningRate(hu.HypothesisTestCase): - @given(**hu.gcs_cpu_only) +class TestLearningRate(serial.SerializedTestCase): + @serial.given(**hu.gcs_cpu_only) def test_alter_learning_rate_op(self, gc, dc): iter = np.random.randint(low=1, high=1e5, size=1) active_period = int(np.random.randint(low=1, high=1e3, size=1)) diff --git a/caffe2/python/operator_test/length_split_op_test.py b/caffe2/python/operator_test/length_split_op_test.py new file mode 100644 index 00000000000000..20ac2b25ba1103 --- /dev/null +++ b/caffe2/python/operator_test/length_split_op_test.py @@ -0,0 +1,152 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from caffe2.python import core +from hypothesis import given +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial +import hypothesis.strategies as st +import numpy as np + + +class TestLengthSplitOperator(serial.SerializedTestCase): + + def _length_split_op_ref(self, input_lengths, n_split_array): + output = [] + n_split = n_split_array[0] + for x in input_lengths: + mod = x % n_split + val = x // n_split + 1 + for _ in range(n_split): + if mod > 0: + output.append(val) + mod -= 1 + else: + output.append(val - 1) + return [np.array(output).astype(np.int32)] + + @serial.given(**hu.gcs_cpu_only) + def test_length_split_edge(self, gc, dc): + input_lengths = np.array([3, 4, 5]).astype(np.int32) + n_split_ = np.array([5]).astype(np.int32) + # Expected output: + # [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1] + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths', + 'n_split'], + ['Y'], + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths, + n_split_], + reference=self._length_split_op_ref, + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, [input_lengths, n_split_], [0]) + + @given(**hu.gcs_cpu_only) + def test_length_split_arg(self, gc, dc): + input_lengths = np.array([9, 4, 5]).astype(np.int32) + n_split = 3 + # Expected output: + # [3, 3, 3, 2, 1, 1, 2, 2, 1] + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths'], + ['Y'], n_split=n_split + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths], + reference=lambda x : self._length_split_op_ref(x, [n_split]), + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, [input_lengths], [0]) + + @given(**hu.gcs_cpu_only) + def 
test_length_split_override_arg(self, gc, dc): + input_lengths = np.array([9, 4, 5]).astype(np.int32) + n_split_ignored = 2 + n_split_used = np.array([3]).astype(np.int32) + + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths', + 'n_split'], + ['Y'], n_split=n_split_ignored + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths, + n_split_used], + reference=self._length_split_op_ref, + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, [input_lengths, n_split_used], [0]) + + @given(m=st.integers(1, 100), n_split=st.integers(1, 20), + **hu.gcs_cpu_only) + def test_length_split_even_divide(self, m, n_split, gc, dc): + # multiples of n_split + input_lengths = np.random.randint(100, size=m).astype(np.int32) * n_split + n_split_ = np.array([n_split]).astype(np.int32) + + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths', + 'n_split'], + ['Y'], + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths, + n_split_], + reference=self._length_split_op_ref, + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, [input_lengths, n_split_], [0]) + + @given(m=st.integers(1, 100), n_split=st.integers(1, 20), + **hu.gcs_cpu_only) + def test_length_split_random(self, m, n_split, gc, dc): + input_lengths = np.random.randint(100, size=m).astype(np.int32) + n_split_ = np.array([n_split]).astype(np.int32) + + op = core.CreateOperator( + 'LengthsSplit', + ['input_lengths', + 'n_split'], + ['Y'], + ) + + # Check against numpy reference + self.assertReferenceChecks( + device_option=gc, + op=op, + inputs=[input_lengths, + n_split_], + reference=self._length_split_op_ref, + ) + # Check over multiple devices + self.assertDeviceChecks(dc, op, [input_lengths, n_split_], [0]) + + +if __name__ == "__main__": + import unittest + unittest.main() diff --git a/caffe2/python/operator_test/lengths_pad_op_test.py b/caffe2/python/operator_test/lengths_pad_op_test.py index f879b702cd5092..d9cd2b2446045d 100644 --- a/caffe2/python/operator_test/lengths_pad_op_test.py +++ b/caffe2/python/operator_test/lengths_pad_op_test.py @@ -6,13 +6,14 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestLengthsPadOp(hu.HypothesisTestCase): +class TestLengthsPadOp(serial.SerializedTestCase): - @given( + @serial.given( inputs=hu.lengths_tensor( dtype=np.float32, min_value=1, diff --git a/caffe2/python/operator_test/lengths_tile_op_test.py b/caffe2/python/operator_test/lengths_tile_op_test.py index d37904c08ec08b..4a9a6b0ff1a9da 100644 --- a/caffe2/python/operator_test/lengths_tile_op_test.py +++ b/caffe2/python/operator_test/lengths_tile_op_test.py @@ -6,16 +6,17 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestLengthsTileOp(hu.HypothesisTestCase): +class TestLengthsTileOp(serial.SerializedTestCase): - @given( + @serial.given( inputs=st.integers(min_value=1, max_value=20).flatmap( lambda size: st.tuples( - hu.arrays([size]), + hu.arrays([size], dtype=np.float32), hu.arrays([size], dtype=np.int32, elements=st.integers(min_value=0, max_value=20)), ) @@ -32,7 +33,7 @@ def 
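
The reference in the new length_split_op_test.py above splits each length into n_split pieces that differ by at most one and sum back to the original. A standalone numpy check of that rule, mirroring _length_split_op_ref and the expected output spelled out in test_length_split_edge:

import numpy as np

def length_split_ref(input_lengths, n_split):
    output = []
    for x in input_lengths:
        mod, val = x % n_split, x // n_split + 1
        for _ in range(n_split):
            if mod > 0:
                output.append(val)
                mod -= 1
            else:
                output.append(val - 1)
    return np.array(output, dtype=np.int32)

splits = length_split_ref([3, 4, 5], 5)
assert splits.tolist() == [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1]
assert splits.reshape(3, 5).sum(axis=1).tolist() == [3, 4, 5]  # pieces sum back to the inputs
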
lengths_tile_op(data, lengths): op = core.CreateOperator( "LengthsTile", ["data", "lengths"], - ["output"] + ["output"], ) self.assertReferenceChecks( diff --git a/caffe2/python/operator_test/lengths_top_k_ops_test.py b/caffe2/python/operator_test/lengths_top_k_ops_test.py index 6ffb5fc4fa843f..8bc27c31144f48 100644 --- a/caffe2/python/operator_test/lengths_top_k_ops_test.py +++ b/caffe2/python/operator_test/lengths_top_k_ops_test.py @@ -2,15 +2,17 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestLengthsTopKOps(hu.HypothesisTestCase): - @given(N=st.integers(min_value=0, max_value=10), +class TestLengthsTopKOps(serial.SerializedTestCase): + @serial.given(N=st.integers(min_value=0, max_value=10), K=st.integers(min_value=1, max_value=10), **hu.gcs_cpu_only) def test_lengths_top_k_op(self, N, K, gc, dc): diff --git a/caffe2/python/operator_test/listwise_l2r_operator_test.py b/caffe2/python/operator_test/listwise_l2r_operator_test.py index c690b3aed3891e..b98100168df022 100644 --- a/caffe2/python/operator_test/listwise_l2r_operator_test.py +++ b/caffe2/python/operator_test/listwise_l2r_operator_test.py @@ -2,6 +2,7 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals + from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/locally_connected_op_test.py b/caffe2/python/operator_test/locally_connected_op_test.py index 29ff72cd1e72db..49051442350e28 100644 --- a/caffe2/python/operator_test/locally_connected_op_test.py +++ b/caffe2/python/operator_test/locally_connected_op_test.py @@ -8,10 +8,11 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestLocallyConnectedOp(hu.HypothesisTestCase): - @given(N=st.integers(1, 3), +class TestLocallyConnectedOp(serial.SerializedTestCase): + @serial.given(N=st.integers(1, 3), C=st.integers(1, 3), H=st.integers(1, 5), W=st.integers(1, 5), diff --git a/caffe2/python/operator_test/loss_ops_test.py b/caffe2/python/operator_test/loss_ops_test.py index a6ea88e55737e5..e57bdb7a1d41df 100644 --- a/caffe2/python/operator_test/loss_ops_test.py +++ b/caffe2/python/operator_test/loss_ops_test.py @@ -6,13 +6,14 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestLossOps(hu.HypothesisTestCase): +class TestLossOps(serial.SerializedTestCase): - @given(n=st.integers(1, 8), **hu.gcs) + @serial.given(n=st.integers(1, 8), **hu.gcs) def test_averaged_loss(self, n, gc, dc): X = np.random.rand(n).astype(np.float32) diff --git a/caffe2/python/operator_test/margin_ranking_criterion_op_test.py b/caffe2/python/operator_test/margin_ranking_criterion_op_test.py index 28f8e0a20f0a90..9e0168eacf9354 100644 --- a/caffe2/python/operator_test/margin_ranking_criterion_op_test.py +++ b/caffe2/python/operator_test/margin_ranking_criterion_op_test.py @@ -3,16 +3,17 @@ from __future__ import print_function from __future__ import 
unicode_literals +from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given import hypothesis.strategies as st import numpy as np -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu - -class TestMarginRankingCriterion(hu.HypothesisTestCase): - @given(N=st.integers(min_value=10, max_value=20), +class TestMarginRankingCriterion(serial.SerializedTestCase): + @serial.given(N=st.integers(min_value=10, max_value=20), seed=st.integers(min_value=0, max_value=65535), margin=st.floats(min_value=-0.5, max_value=0.5), **hu.gcs) diff --git a/caffe2/python/operator_test/math_ops_test.py b/caffe2/python/operator_test/math_ops_test.py index 4661c7715ec518..0772aee5c9b285 100644 --- a/caffe2/python/operator_test/math_ops_test.py +++ b/caffe2/python/operator_test/math_ops_test.py @@ -7,12 +7,13 @@ from hypothesis import given from hypothesis import strategies as st import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import numpy as np import unittest -class TestMathOps(hu.HypothesisTestCase): +class TestMathOps(serial.SerializedTestCase): @given(X=hu.tensor(), exponent=st.floats(min_value=2.0, max_value=3.0), @@ -31,7 +32,7 @@ def powf_grad(g_out, outputs, fwd_inputs): output_to_grad="Y", grad_reference=powf_grad), - @given(X=hu.tensor(), + @serial.given(X=hu.tensor(), exponent=st.floats(min_value=-3.0, max_value=3.0), **hu.gcs) def test_sign(self, X, exponent, gc, dc): diff --git a/caffe2/python/operator_test/matmul_op_test.py b/caffe2/python/operator_test/matmul_op_test.py index 67fdf2cf5ffe11..1872a129e569c8 100644 --- a/caffe2/python/operator_test/matmul_op_test.py +++ b/caffe2/python/operator_test/matmul_op_test.py @@ -13,10 +13,11 @@ from caffe2.proto import caffe2_pb2 from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestMatMul(hu.HypothesisTestCase): - @given( +class TestMatMul(serial.SerializedTestCase): + @serial.given( M=st.integers(min_value=1, max_value=10), K=st.integers(min_value=1, max_value=10), N=st.integers(min_value=1, max_value=10), @@ -125,7 +126,7 @@ def matmul_ref(X, Y, axis_a, axis_b, trans_a, trans_b): self.assertGradientChecks(gc, op, [X, Y], 1, [0]) -class TestBatchMatMul(hu.HypothesisTestCase): +class TestBatchMatMul(serial.SerializedTestCase): @settings(max_examples=30) @given( C=st.integers(min_value=0, max_value=3), # number of batch dims @@ -214,7 +215,7 @@ def matmul_ref(X, Y, trans_a, trans_b, dtype): # Check over multiple devices self.assertDeviceChecks(dc, op, [X, Y], [0]) - @given( + @serial.given( C_1=st.integers(min_value=0, max_value=3), # number of batch dims C_2=st.integers(min_value=0, max_value=3), M=st.integers(min_value=1, max_value=10), diff --git a/caffe2/python/operator_test/mean_op_test.py b/caffe2/python/operator_test/mean_op_test.py index cbb1adc954784d..77c6b82625b139 100644 --- a/caffe2/python/operator_test/mean_op_test.py +++ b/caffe2/python/operator_test/mean_op_test.py @@ -4,16 +4,17 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np - import unittest -class 
TestMean(hu.HypothesisTestCase): - @given( +class TestMean(serial.SerializedTestCase): + @serial.given( k=st.integers(1, 5), n=st.integers(1, 10), m=st.integers(1, 10), diff --git a/caffe2/python/operator_test/merge_id_lists_op_test.py b/caffe2/python/operator_test/merge_id_lists_op_test.py index 1b4322e0624f70..9f3302c6e75a3f 100644 --- a/caffe2/python/operator_test/merge_id_lists_op_test.py +++ b/caffe2/python/operator_test/merge_id_lists_op_test.py @@ -3,15 +3,13 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np - -from hypothesis import given -import hypothesis.strategies as st - from caffe2.python import core import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.extra.numpy as hnp +import hypothesis.strategies as st +import numpy as np @st.composite @@ -53,7 +51,7 @@ def merge_arrays(vs, offs, j): return merged_lengths, merged_values -class TestMergeIdListsOp(hu.HypothesisTestCase): +class TestMergeIdListsOp(serial.SerializedTestCase): def test_merge_id_lists_ref(self): # Verify that the reference implementation is correct! lengths_0 = np.array([3, 0, 4], dtype=np.int32) @@ -69,8 +67,7 @@ def test_merge_id_lists_ref(self): np.testing.assert_array_equal(merged_lengths, expected_lengths) np.testing.assert_array_equal(merged_values, expected_values) - @given(inputs=id_list_batch(), - **hu.gcs_cpu_only) + @serial.given(inputs=id_list_batch(), **hu.gcs_cpu_only) def test_merge_id_lists_op(self, inputs, gc, dc): num_inputs = int(len(inputs) / 2) op = core.CreateOperator( diff --git a/caffe2/python/operator_test/moments_op_test.py b/caffe2/python/operator_test/moments_op_test.py index fa456c8382f64b..ae9d9158f5062b 100644 --- a/caffe2/python/operator_test/moments_op_test.py +++ b/caffe2/python/operator_test/moments_op_test.py @@ -7,12 +7,13 @@ from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st -import numpy as np import itertools as it +import numpy as np -class TestMomentsOp(hu.HypothesisTestCase): +class TestMomentsOp(serial.SerializedTestCase): def run_moments_test(self, X, axes, keepdims, gc, dc): if axes is None: op = core.CreateOperator( @@ -41,7 +42,7 @@ def ref(X): self.assertDeviceChecks(dc, op, [X], [0, 1]) self.assertGradientChecks(gc, op, [X], 0, [0, 1]) - @given(X=hu.tensor(dtype=np.float32), keepdims=st.booleans(), + @serial.given(X=hu.tensor(dtype=np.float32), keepdims=st.booleans(), num_axes=st.integers(1, 4), **hu.gcs) def test_moments(self, X, keepdims, num_axes, gc, dc): self.run_moments_test(X, None, keepdims, gc, dc) diff --git a/caffe2/python/operator_test/momentum_sgd_test.py b/caffe2/python/operator_test/momentum_sgd_test.py index 7bfceb61121ca2..39e358f30d386e 100644 --- a/caffe2/python/operator_test/momentum_sgd_test.py +++ b/caffe2/python/operator_test/momentum_sgd_test.py @@ -5,6 +5,7 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis from hypothesis import given @@ -13,8 +14,8 @@ import unittest -class TestMomentumSGD(hu.HypothesisTestCase): - @given(n=st.integers(4, 8), nesterov=st.booleans(), **hu.gcs) +class TestMomentumSGD(serial.SerializedTestCase): + @serial.given(n=st.integers(4, 8), nesterov=st.booleans(), **hu.gcs) def test_momentum_sgd(self, n, nesterov, gc, dc): param 
= np.random.rand(n).astype(np.float32) grad = np.random.rand(n).astype(np.float32) @@ -69,7 +70,7 @@ def momentum_sgd(grad, param_momentum, lr, param=None): reference=momentum_sgd ) - @given( + @serial.given( inputs=hu.tensors(n=3), momentum=st.floats(min_value=0.1, max_value=0.9), nesterov=st.booleans(), diff --git a/caffe2/python/operator_test/negate_gradient_op_test.py b/caffe2/python/operator_test/negate_gradient_op_test.py index d37955ac1b4d2e..309236a281a492 100644 --- a/caffe2/python/operator_test/negate_gradient_op_test.py +++ b/caffe2/python/operator_test/negate_gradient_op_test.py @@ -3,18 +3,17 @@ from __future__ import print_function from __future__ import unicode_literals -import numpy as np +from caffe2.python import workspace, core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import workspace, core +import numpy as np -class TestNegateGradient(hu.HypothesisTestCase): +class TestNegateGradient(serial.SerializedTestCase): - @given(X=hu.tensor(), - inplace=st.booleans(), - **hu.gcs) + @serial.given(X=hu.tensor(), inplace=st.booleans(), **hu.gcs) def test_forward(self, X, inplace, gc, dc): def neg_grad_ref(X): return (X,) diff --git a/caffe2/python/operator_test/numpy_tile_op_test.py b/caffe2/python/operator_test/numpy_tile_op_test.py index c1d02de1d09690..42fde4c9452251 100644 --- a/caffe2/python/operator_test/numpy_tile_op_test.py +++ b/caffe2/python/operator_test/numpy_tile_op_test.py @@ -11,10 +11,11 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial -class TestNumpyTile(hu.HypothesisTestCase): - @given(ndim=st.integers(min_value=1, max_value=4), +class TestNumpyTile(serial.SerializedTestCase): + @serial.given(ndim=st.integers(min_value=1, max_value=4), seed=st.integers(min_value=0, max_value=65536), **hu.gcs_cpu_only) def test_numpy_tile(self, ndim, seed, gc, dc): diff --git a/caffe2/python/operator_test/one_hot_ops_test.py b/caffe2/python/operator_test/one_hot_ops_test.py index da1c11fbc2cc2e..19e6ee10e3ddf9 100644 --- a/caffe2/python/operator_test/one_hot_ops_test.py +++ b/caffe2/python/operator_test/one_hot_ops_test.py @@ -7,6 +7,7 @@ from caffe2.proto import caffe2_pb2 from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np @@ -25,8 +26,8 @@ def _one_hots(): max_size=sum(x[1])))) -class TestOneHotOps(hu.HypothesisTestCase): - @given( +class TestOneHotOps(serial.SerializedTestCase): + @serial.given( x=hu.tensor( min_dim=2, max_dim=2, dtype=np.int32, elements=st.integers(min_value=0, max_value=10)), @@ -56,7 +57,7 @@ def ref(x, lens, vals): op = core.CreateOperator('BatchOneHot', ["X", "LENS", "VALS"], ["Y"]) self.assertReferenceChecks(gc, op, [x, lens, vals], ref) - @given( + @serial.given( x=hu.tensor( min_dim=2, max_dim=2, dtype=np.float32, elements=st.integers(min_value=-5, max_value=5)), @@ -108,7 +109,7 @@ def ref(x, lens, boundaries): ["X", "LENS", "BOUNDARIES"], ["Y"]) self.assertReferenceChecks(gc, op, [x, lens, boundaries], ref) - @given( + @serial.given( hot_indices=hu.tensor( min_dim=1, max_dim=1, dtype=np.int64, elements=st.integers(min_value=0, max_value=42)), @@ -134,7 +135,7 @@ def 
one_hot_ref(hot_indices, size): one_hot_ref, input_device_options={'size': core.DeviceOption(caffe2_pb2.CPU)}) - @given(hot_indices=_one_hots()) + @serial.given(hot_indices=_one_hots()) def test_segment_one_hot(self, hot_indices): index_size, lengths, indices = hot_indices diff --git a/caffe2/python/operator_test/onnx_while_test.py b/caffe2/python/operator_test/onnx_while_test.py index 0cba8053d53e05..eaf0ef58ba30e6 100644 --- a/caffe2/python/operator_test/onnx_while_test.py +++ b/caffe2/python/operator_test/onnx_while_test.py @@ -2,17 +2,18 @@ from __future__ import division from __future__ import print_function -import numpy as np -import hypothesis.strategies as st -import unittest -import caffe2.python.hypothesis_test_util as hu +from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given -from caffe2.proto import caffe2_pb2 +import hypothesis.strategies as st +import numpy as np +import unittest -class TestONNXWhile(hu.HypothesisTestCase): - @given( +class TestONNXWhile(serial.SerializedTestCase): + @serial.given( condition=st.booleans(), max_trip_count=st.integers(0, 100), save_scopes=st.booleans(), diff --git a/caffe2/python/operator_test/order_switch_test.py b/caffe2/python/operator_test/order_switch_test.py index d54ac26c55fb36..5d3fd0e60f5e08 100644 --- a/caffe2/python/operator_test/order_switch_test.py +++ b/caffe2/python/operator_test/order_switch_test.py @@ -1,14 +1,17 @@ from __future__ import absolute_import, division, print_function, unicode_literals import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st + from caffe2.python import core from hypothesis import given class OrderSwitchOpsTest(hu.HypothesisTestCase): - @given(X=hu.tensor(min_dim=3, max_dim=5, min_value=1, max_value=5), **hu.gcs) - def test_nchw2nhwc(self, X, gc, dc): - op = core.CreateOperator("NCHW2NHWC", ["X"], ["Y"], device_option=gc) + @given(X=hu.tensor(min_dim=3, max_dim=5, min_value=1, max_value=5), + engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) + def test_nchw2nhwc(self, X, engine, gc, dc): + op = core.CreateOperator("NCHW2NHWC", ["X"], ["Y"], engine=engine) def nchw2nhwc_ref(X): X_reshaped = X.transpose((0,) + tuple(range(2, X.ndim)) + (1,)) @@ -18,12 +21,14 @@ def nchw2nhwc_ref(X): self.assertGradientChecks(gc, op, [X], 0, [0]) self.assertDeviceChecks(dc, op, [X], [0]) - @given(X=hu.tensor(min_dim=3, max_dim=5, min_value=1, max_value=5), **hu.gcs) - def test_nhwc2nchw(self, X, gc, dc): - op = core.CreateOperator("NHWC2NCHW", ["X"], ["Y"], device_option=gc) + @given(X=hu.tensor(min_dim=3, max_dim=5, min_value=1, max_value=5), + engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) + def test_nhwc2nchw(self, X, engine, gc, dc): + op = core.CreateOperator("NHWC2NCHW", ["X"], ["Y"], engine=engine) def nhwc2nchw_ref(X): - X_reshaped = X.transpose((0, X.ndim - 1) + tuple(range(1, X.ndim - 1))) + X_reshaped = X.transpose( + (0, X.ndim - 1) + tuple(range(1, X.ndim - 1))) return (X_reshaped,) self.assertReferenceChecks(gc, op, [X], nhwc2nchw_ref) diff --git a/caffe2/python/operator_test/pack_ops_test.py b/caffe2/python/operator_test/pack_ops_test.py index 3935ca8c8f17ab..f6674ed625bd45 100644 --- a/caffe2/python/operator_test/pack_ops_test.py +++ b/caffe2/python/operator_test/pack_ops_test.py @@ -5,6 +5,7 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu +import 
caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given from hypothesis import strategies as st @@ -12,7 +13,7 @@ import time -class TestTensorPackOps(hu.HypothesisTestCase): +class TestTensorPackOps(serial.SerializedTestCase): def pack_segments_ref(self, return_presence_mask=False, max_length=None): def pack_segments_ref(lengths, data, max_length=max_length): @@ -53,7 +54,7 @@ def pack_segments_ref(lengths, data, max_length=max_length): return pack_segments_ref - @given( + @serial.given( num_seq=st.integers(10, 100), cell_size=st.integers(1, 10), **hu.gcs diff --git a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py index a5a3d6de537d01..6bf2315ca0c52e 100644 --- a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py +++ b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py @@ -6,13 +6,14 @@ from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st import numpy as np -class TestPackRNNSequenceOperator(hu.HypothesisTestCase): +class TestPackRNNSequenceOperator(serial.SerializedTestCase): - @given(n=st.integers(0, 10), k=st.integers(1, 5), + @serial.given(n=st.integers(0, 10), k=st.integers(1, 5), dim=st.integers(1, 5), **hu.gcs_cpu_only) def test_pack_rnn_seqence(self, n, k, dim, gc, dc): lengths = np.random.randint(k, size=n).astype(np.int32) + 1 @@ -47,7 +48,7 @@ def pack_op(values, lengths): # Gradient check self.assertGradientChecks(gc, op, [values, lengths], 0, [0]) - @given(n=st.integers(0, 10), k=st.integers(2, 5), + @serial.given(n=st.integers(0, 10), k=st.integers(2, 5), dim=st.integers(1, 5), **hu.gcs_cpu_only) def test_unpack_rnn_seqence(self, n, k, dim, gc, dc): lengths = np.random.randint(k, size=n).astype(np.int32) + 1 diff --git a/caffe2/python/operator_test/pad_test.py b/caffe2/python/operator_test/pad_test.py index ee5e001a91e2ce..43cd10c231887e 100644 --- a/caffe2/python/operator_test/pad_test.py +++ b/caffe2/python/operator_test/pad_test.py @@ -2,16 +2,18 @@ from __future__ import division from __future__ import print_function -import numpy as np -import hypothesis.strategies as st -import unittest -import caffe2.python.hypothesis_test_util as hu from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given +import hypothesis.strategies as st +import numpy as np +import unittest -class TestPad(hu.HypothesisTestCase): - @given(pad_t=st.integers(-5, 0), +class TestPad(serial.SerializedTestCase): + @serial.given(pad_t=st.integers(-5, 0), pad_l=st.integers(-5, 0), pad_b=st.integers(-5, 0), pad_r=st.integers(-5, 0), diff --git a/caffe2/python/operator_test/piecewise_linear_transform_test.py b/caffe2/python/operator_test/piecewise_linear_transform_test.py index f09dcdcde2b6d5..83d67f4beea966 100644 --- a/caffe2/python/operator_test/piecewise_linear_transform_test.py +++ b/caffe2/python/operator_test/piecewise_linear_transform_test.py @@ -4,15 +4,16 @@ from __future__ import unicode_literals from caffe2.python import core +import caffe2.python.hypothesis_test_util as hu +import caffe2.python.serialized_test.serialized_test_util as serial + from hypothesis import given import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu import numpy as np - import unittest -class 
TestPiecewiseLinearTransform(hu.HypothesisTestCase): +class TestPiecewiseLinearTransform(serial.SerializedTestCase): def constrain(self, v, min_val, max_val): def constrain_internal(x): return min(max(x, min_val), max_val) @@ -31,7 +32,7 @@ def transform(self, x, bounds, slopes, intercepts): y = slopes[index] * x_ + intercepts[index] return y - @given(n=st.integers(1, 100), **hu.gcs) + @serial.given(n=st.integers(1, 100), **hu.gcs) def test_multi_predictions_params_from_arg(self, n, gc, dc): slopes = np.random.uniform(-1, 1, (2, n)).astype(np.float32) intercepts = np.random.uniform(-1, 1, (2, n)).astype(np.float32) diff --git a/caffe2/python/operator_test/weighted_sum_test.py b/caffe2/python/operator_test/weighted_sum_test.py index 9cdeafaae50bc9..5882f7aef8346d 100644 --- a/caffe2/python/operator_test/weighted_sum_test.py +++ b/caffe2/python/operator_test/weighted_sum_test.py @@ -14,7 +14,7 @@ class TestWeightedSumOp(serial.SerializedTestCase): - @serial.given_and_seeded( + @serial.given( n=st.integers(1, 8), m=st.integers(1, 10), d=st.integers(1, 4), in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), seed=st.integers(min_value=0, max_value=65535), diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index 2ebcf1d92a1240..482d16a0dfa6a6 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -8,6 +8,8 @@ from collections import namedtuple, defaultdict from past.builtins import basestring +import logging + import numpy as np from caffe2.python import core, scope, utils, workspace @@ -20,6 +22,8 @@ AuxOptimizerParams = namedtuple("AuxOptimizerParams", ["local", "shared"]) _optimizer_instance_count = defaultdict(int) +logger = logging.getLogger(__name__) + class Optimizer(object): def __init__(self): @@ -554,6 +558,8 @@ def _run(self, net, param_init_net, param_info): ) if self.rowWise: + assert self.engine == "SIMD", "Got {}".format(self.engine) + shapes, types = workspace.InferShapesAndTypes([param_init_net]) if str(param) not in shapes: # Type/shape inference is not available for this param, fallback @@ -577,13 +583,24 @@ def _run(self, net, param_init_net, param_info): shape=[shapes[str(param)][0]], value=0.0 ) - else: - param_squared_sum = param_init_net.ConstantFill( - [param], - str(param) + "_squared_sum", - value=0.0 - ) + if self.engine == "SIMD_Q_FP16" or self.engine == "SIMD_Q_STOC_FP16": + shapes, types = workspace.InferShapesAndTypes([param_init_net]) + assert str(param) in shapes, shapes + shape = shapes[str(param)] + + param_squared_sum = param_init_net.Float16ConstantFill( + [], + str(param) + "_squared_sum", + value=0.0, + shape=shape, + ) + else: + param_squared_sum = param_init_net.ConstantFill( + [param], + str(param) + "_squared_sum", + value=0.0 + ) self._aux_params.local.append(param_squared_sum) @@ -604,7 +621,7 @@ def _run(self, net, param_init_net, param_info): [param, param_squared_sum, grad.indices, grad.values, lr], [param, param_squared_sum], epsilon=self.epsilon, - engine=self.engine + engine=self.engine, ) else: output_args = [param, param_squared_sum] @@ -913,19 +930,6 @@ def _run(self, net, param_init_net, param_info): **(self.init_kwargs) ) - if self.use_lr_adaption: - effective_grad = param_init_net.ConstantFill( - [param], - param + "_effgrad", - value=0.0 - ) - self._aux_params.local.append(effective_grad) - net.LearningRateAdaption( - [lr, grad, effective_grad], - [lr], - lr_alpha=self.lr_alpha, - normalized_lr_adaption=self.normalized_lr_adaption) - m1 = param_init_net.ConstantFill( [param], param + 
"_first_moment", @@ -956,35 +960,45 @@ def _run(self, net, param_init_net, param_info): 'If SparseAdam with rowWise=True, gradient must be '\ 'a gradientslice. PLease ensure that rowWise is not enabled '\ 'for the dense Adam optimizer, as it is not supported.' + + output_blobs = [param, m1, m2] + if self.use_lr_adaption: + effective_grad = str(param) + '_effective_grad' + output_blobs.append(effective_grad) + if isinstance(grad, core.GradientSlice): grad = self.dedup(net, self.sparse_dedup_aggregator, grad) if self.rowWise: op = 'RowWiseSparseAdam' else: op = 'SparseAdam' + net.__getattr__(op)( [param, m1, m2, grad.indices, grad.values, lr, iteration], - [param, m1, m2], + output_blobs, beta1=self.beta1, beta2=self.beta2, - epsilon=self.epsilon - ) + epsilon=self.epsilon) + if self.use_lr_adaption: + net.LearningRateAdaption( + [lr, grad.values, effective_grad], + [lr], + lr_alpha=self.lr_alpha, + normalized_lr_adaption=self.normalized_lr_adaption) else: + net.Adam( + [param, m1, m2, grad, lr, iteration], + output_blobs, + beta1=self.beta1, + beta2=self.beta2, + epsilon=self.epsilon) if self.use_lr_adaption: - net.Adam( - [param, m1, m2, grad, lr, iteration], - [param, m1, m2, effective_grad], - beta1=self.beta1, - beta2=self.beta2, - epsilon=self.epsilon) - else: - net.Adam( - [param, m1, m2, grad, lr, iteration], - [param, m1, m2], - beta1=self.beta1, - beta2=self.beta2, - epsilon=self.epsilon) + net.LearningRateAdaption( + [lr, grad, effective_grad], + [lr], + lr_alpha=self.lr_alpha, + normalized_lr_adaption=self.normalized_lr_adaption) def scale_learning_rate(self, scale): self.alpha *= scale diff --git a/caffe2/python/pipeline.py b/caffe2/python/pipeline.py index ee38fe52df8c4e..ade4e6ac9248e8 100644 --- a/caffe2/python/pipeline.py +++ b/caffe2/python/pipeline.py @@ -324,7 +324,8 @@ def _pipe_step( elif hasattr(input, 'reader'): reader = input.reader() else: - raise ValueError('in must be a reader, queue or stream.') + raise ValueError( + 'Input must be a reader, queue or stream. 
Got {}'.format(type(input))) if processor is not None: reader = ProcessingReader(reader, processor) diff --git a/caffe2/python/pybind_state_dlpack.h b/caffe2/python/pybind_state_dlpack.h index 8b6f5e1a5c3033..e0122fdcc9983a 100644 --- a/caffe2/python/pybind_state_dlpack.h +++ b/caffe2/python/pybind_state_dlpack.h @@ -108,16 +108,21 @@ class DLPackWrapper { } tensor->Resize(dims); - const auto& meta = DLTypeToCaffe(dlTensor->dtype); + caffe2::TypeMeta meta = DLTypeToCaffe(dlTensor->dtype); + at::Device device = at::Device(tensor->GetDeviceType()); tensor->ShareExternalPointer( - ((int8_t*)dlTensor->data) + dlTensor->byte_offset, + at::DataPtr( + (void*)(((int8_t*)dlTensor->data) + dlTensor->byte_offset), + static_cast(dlMTensor), + [](void* t_ptr) -> void { + DLManagedTensor* mt_ptr = static_cast(t_ptr); + if (mt_ptr->destructor) { + mt_ptr->destructor(mt_ptr); + } + }, + device), meta, - 0, - [dlMTensor](void*) { - if (dlMTensor->destructor) { - dlMTensor->destructor(dlMTensor); - } - }); + 0); } Tensor* tensor; diff --git a/caffe2/python/pybind_state_gpu.cc b/caffe2/python/pybind_state_gpu.cc index 9c15bc2145d5a9..8c547cf8eccca6 100644 --- a/caffe2/python/pybind_state_gpu.cc +++ b/caffe2/python/pybind_state_gpu.cc @@ -10,7 +10,9 @@ #include #include +#ifdef CAFFE2_USE_CUDNN #include "caffe2/core/common_cudnn.h" +#endif // CAFFE2_USE_CUDNN #include "caffe2/core/context_gpu.h" #include "caffe2/operators/operator_fallback_gpu.h" #include "caffe2/python/pybind_state_registry.h" @@ -39,10 +41,12 @@ namespace py = pybind11; void addCUDAGlobalMethods(py::module& m) { m.def("num_cuda_devices", &NumCudaDevices); m.def("get_cuda_version", &CudaVersion); +#ifdef CAFFE2_USE_CUDNN m.def("get_cudnn_version", &cudnnCompiledVersion); m.attr("cudnn_convolution_fwd_algo_count") = py::int_((int) CUDNN_CONVOLUTION_FWD_ALGO_COUNT); m.attr("cudnn_convolution_bwd_data_algo_count") = py::int_((int) CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT); m.attr("cudnn_convolution_bwd_filter_algo_count") = py::int_((int) CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT); +#endif m.def("get_cuda_peer_access_pattern", []() { std::vector> pattern; CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&pattern)); diff --git a/caffe2/python/pybind_state_nomni.cc b/caffe2/python/pybind_state_nomni.cc index fbfe143f66cee0..1f05d3bd1beeb4 100644 --- a/caffe2/python/pybind_state_nomni.cc +++ b/caffe2/python/pybind_state_nomni.cc @@ -198,7 +198,45 @@ void addNomnigraphMethods(pybind11::module& m) { CAFFE_ENFORCE(nn::is(n)); return nn::get(n); }, - py::return_value_policy::reference_internal); + py::return_value_policy::reference_internal) + .def( + "getAnnotation", + [](NNGraph::NodeRef n) { return getOrAddCaffe2Annotation(n); }) + .def( + "setAnnotation", + [](NNGraph::NodeRef n, Caffe2Annotation annot) { + auto* nnOp = nn::get(n); + nnOp->setAnnotation( + nom::util::make_unique(annot)); + }) + .def( + "getOperatorPredecessors", + [](NNGraph::NodeRef n) { + CAFFE_ENFORCE(nn::is(n)); + std::vector pred; + for (const auto& inEdge : n->getInEdges()) { + auto data = inEdge->tail(); + if (nn::hasProducer(data)) { + pred.emplace_back(nn::getProducer(data)); + } + } + return pred; + }, + py::return_value_policy::reference) + .def( + "getOperatorSuccessors", + [](NNGraph::NodeRef n) { + CAFFE_ENFORCE(nn::is(n)); + std::vector succ; + for (const auto& outEdge : n->getOutEdges()) { + auto data = outEdge->head(); + for (const auto& consumer : nn::getConsumers(data)) { + succ.emplace_back(consumer); + } + } + return succ; + }, + py::return_value_policy::reference); 
py::class_ nnop(m, "NeuralNetOperator"); py::class_ nndata(m, "NeuralNetData"); @@ -233,9 +271,11 @@ void addNomnigraphMethods(pybind11::module& m) { auto nnOp = nn::get(node); return opName == nnOp->getName(); }); - return g->createNode( - nom::matcher::MatchNode( - match, true, 1, !strict)); + auto node = nom::matcher::MatchNode(match); + if (!strict) { + node.nonTerminal(); + } + return g->createNode(std::move(node)); }, py::return_value_policy::reference_internal, py::arg("node"), @@ -243,9 +283,11 @@ void addNomnigraphMethods(pybind11::module& m) { .def( "createNode", [](nn::NNMatchGraph* g, nom::repr::Tensor& tensor, bool strict) { - return g->createNode( - nom::matcher::MatchNode( - nn::matchTensor(), true, 1, !strict)); + auto node = nn::NNMatchNode(nn::matchTensor()); + if (!strict) { + node.nonTerminal(); + } + return g->createNode(std::move(node)); }, py::return_value_policy::reference_internal, py::arg("tensor"), @@ -255,9 +297,11 @@ void addNomnigraphMethods(pybind11::module& m) { [](nn::NNMatchGraph* g, bool strict) { auto match = nn::NNNodeMatchCriteria( [](NNGraph::NodeRef node) { return true; }); - return g->createNode( - nom::matcher::MatchNode( - match, true, 1, !strict)); + auto node = nom::matcher::MatchNode(match); + if (!strict) { + node.nonTerminal(); + } + return g->createNode(std::move(node)); }, py::return_value_policy::reference_internal, py::arg("strict") = false) @@ -276,6 +320,14 @@ void addNomnigraphMethods(pybind11::module& m) { } return NNSubgraph(); }); + + // Annotation API + py::class_ annotation(m, "Annotation"); + annotation.def(py::init<>()) + .def("setDevice", &Caffe2Annotation::setDevice) + .def("getDevice", &Caffe2Annotation::getDevice) + .def("setDeviceType", &Caffe2Annotation::setDeviceType) + .def("getDeviceType", &Caffe2Annotation::getDeviceType); } REGISTER_PYBIND_ADDITION(addNomnigraphMethods); diff --git a/caffe2/python/serialized_test/README.md b/caffe2/python/serialized_test/README.md index 00d104d309f652..2885ed290cab8e 100644 --- a/caffe2/python/serialized_test/README.md +++ b/caffe2/python/serialized_test/README.md @@ -4,9 +4,15 @@ Major functionality lives in `serialized_test_util.py` ## How to use 1. Extend the test case class from `SerializedTestCase` -2. Change the `@given` decorator to `@given_and_seeded`. This runs a seeded hypothesis test instance which will generate outputs if desired in addition to the unseeded hypothesis tests normally run. -3. Change a call to `unittest.main()` in `__main__` to `testWithArgs`. -4. Run your test `python caffe2/python/operator_test/my_test.py -g` to generate serialized outputs. They will live in `caffe2/python/serialized_test/data/operator_test`, one folder per test function -5. Thereafter, runs of the test without the flag will load serialized outputs and gradient operators for comparison against the seeded run. If for any reason the seeded run's inputs are different (this can happen with different hypothesis versions or different setups), then we'll run the serialized inputs through the serialized operator to get a runtime output for comparison. +2. Change the `@given` decorator to `@serialized_test_util.given`. This runs a seeded hypothesis test instance which will generate outputs if desired in addition to the unseeded hypothesis tests normally run. +3. [Optional] Add (or change a call of `unittest.main()` to) `testWithArgs` in `__main__`. This allows you to generate outputs using `python caffe2/python/operator_test/my_test.py -G`. +4. 
Run your test `python -m pytest caffe2/python/operator_test/my_test.py -G` to generate serialized outputs. They will live in `caffe2/python/serialized_test/data/operator_test`, one npz file per test function. Use `-O` to change the output directory. +5. Thereafter, runs of the test without the flag will load serialized outputs and gradient operators for comparison against the seeded run. The comparison is done as long as you have a call to assertReferenceChecks. If for any reason the seeded run's inputs are different (this can happen with different hypothesis versions or different setups), then we'll run the serialized inputs through the serialized operator to get a runtime output for comparison. + +## Additional Notes + +If we'd like to extend the test framework beyond that for operator tests, we can create a new subfolder for them inside `caffe2/python/serialized_test/data`. + +Note that we currently don't support using other hypothesis decorators on top of `given_and_seeded`. Hypothesis has some handling to explicitly check that `@given` is on the bottom of the decorator stack. + +If there are multiple calls to assertReferenceChecks in a test function, we'll serialize and write the last one. The actual input checked may then differ if we refactor a test function that calls this multiple times, though the serialized test should still pass since we then use the serialized input to generate a dynamic output.
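To make the workflow above concrete, here is a minimal sketch of an operator test converted to this pattern, mirroring the changes applied to the operator tests in this diff. The operator (`Sum`), the strategies, and the test name are purely illustrative, and calling the entry point as `serial.testWithArgs()` in `__main__` is an assumption about where the `testWithArgs` helper mentioned in step 3 lives:

```python
from caffe2.python import core
import caffe2.python.hypothesis_test_util as hu
import caffe2.python.serialized_test.serialized_test_util as serial
import hypothesis.strategies as st
import numpy as np


class TestCopyOps(serial.SerializedTestCase):
    # @serial.given runs one seeded example (used to write and later compare
    # the serialized inputs/outputs/gradients) in addition to the usual
    # unseeded hypothesis runs.
    @serial.given(n=st.integers(1, 5), m=st.integers(1, 10), **hu.gcs_cpu_only)
    def test_sum_single_input(self, n, m, gc, dc):
        X = np.random.rand(n, m).astype(np.float32)
        op = core.CreateOperator("Sum", ["X"], ["Y"])

        def ref(X):
            # Sum of a single input is the input itself.
            return (X,)

        # The last assertReferenceChecks call in the test function is the one
        # that gets serialized and replayed on later runs.
        self.assertReferenceChecks(gc, op, [X], ref)


if __name__ == "__main__":
    # Optional (step 3): allows `python my_test.py -G` to regenerate outputs.
    serial.testWithArgs()
```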
diff --git a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_elu.zip b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_elu.zip new file mode 100644 index 00000000000000..415a47d71c3166 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_elu.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_leaky_relu.zip b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_leaky_relu.zip new file mode 100644 index 00000000000000..e4584245ab112c Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_leaky_relu.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu.zip b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu.zip new file mode 100644 index 00000000000000..0dc8e48877d431 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu_n.zip b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu_n.zip new file mode 100644 index 00000000000000..07e439a921cfcf Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu_n.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_adadelta.zip b/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_adadelta.zip new file mode 100644 index 00000000000000..2bdb95bdaf798c Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_adadelta.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_sparse_adadelta_empty.zip b/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_sparse_adadelta_empty.zip new file mode 100644 index 00000000000000..adac8479290e62 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_sparse_adadelta_empty.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_adagrad.zip b/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_adagrad.zip new file mode 100644 index 00000000000000..9326bfd4df1da4 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_adagrad.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_row_wise_sparse_adagrad_empty.zip b/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_row_wise_sparse_adagrad_empty.zip new file mode 100644 index 00000000000000..27c7db4e9a17de Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_row_wise_sparse_adagrad_empty.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_sparse_adagrad_empty.zip b/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_sparse_adagrad_empty.zip new file mode 100644 index 00000000000000..8ddd81fbf44a9f Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_sparse_adagrad_empty.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/affine_channel_op_test.test_affine_channel_2d.zip b/caffe2/python/serialized_test/data/operator_test/affine_channel_op_test.test_affine_channel_2d.zip new file mode 100644 index 00000000000000..cb08f9f86e6df4 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/affine_channel_op_test.test_affine_channel_2d.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmax.zip b/caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmax.zip new file mode 100644 index 00000000000000..88ec3cc8dbba70 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmax.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmin.zip b/caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmin.zip new file mode 100644 index 00000000000000..a0e1408b5a731b Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmin.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/batch_box_cox_test.test_batch_box_cox.zip b/caffe2/python/serialized_test/data/operator_test/batch_box_cox_test.test_batch_box_cox.zip new file mode 100644 index 00000000000000..5a115b00e3b154 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/batch_box_cox_test.test_batch_box_cox.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/batch_bucketize_op_test.test_batch_bucketize_example.zip b/caffe2/python/serialized_test/data/operator_test/batch_bucketize_op_test.test_batch_bucketize_example.zip new file mode 100644 index 00000000000000..73717a440d9581 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/batch_bucketize_op_test.test_batch_bucketize_example.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/batch_moments_op_test.test_batch_moments_2d.zip b/caffe2/python/serialized_test/data/operator_test/batch_moments_op_test.test_batch_moments_2d.zip new file mode 100644 index 00000000000000..bb95ce7149a696 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/batch_moments_op_test.test_batch_moments_2d.zip differ diff
--git a/caffe2/python/serialized_test/data/operator_test/batch_sparse_to_dense_op_test.test_batch_sparse_to_dense.zip b/caffe2/python/serialized_test/data/operator_test/batch_sparse_to_dense_op_test.test_batch_sparse_to_dense.zip new file mode 100644 index 00000000000000..eaddc47759a9c0 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/batch_sparse_to_dense_op_test.test_batch_sparse_to_dense.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/bbox_transform_test.test_bbox_transform.zip b/caffe2/python/serialized_test/data/operator_test/bbox_transform_test.test_bbox_transform.zip new file mode 100644 index 00000000000000..f51ee2ee182b35 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/bbox_transform_test.test_bbox_transform.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_boolean_mask.zip b/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_boolean_mask.zip new file mode 100644 index 00000000000000..668efa6e1643b5 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_boolean_mask.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_sequence_mask_with_lengths.zip b/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_sequence_mask_with_lengths.zip new file mode 100644 index 00000000000000..12692067370534 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_sequence_mask_with_lengths.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/boolean_unmask_test.test.zip b/caffe2/python/serialized_test/data/operator_test/boolean_unmask_test.test.zip new file mode 100644 index 00000000000000..166b4b1d8022ce Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/boolean_unmask_test.test.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/box_with_nms_limit_op_test.test_simple.zip b/caffe2/python/serialized_test/data/operator_test/box_with_nms_limit_op_test.test_simple.zip new file mode 100644 index 00000000000000..ccdd2257ffc71b Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/box_with_nms_limit_op_test.test_simple.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/ceil_op_test.test_ceil.zip b/caffe2/python/serialized_test/data/operator_test/ceil_op_test.test_ceil.zip new file mode 100644 index 00000000000000..928a74f90cec5d Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/ceil_op_test.test_ceil.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/channel_backprop_stats_op_test.testChannelBackpropStats.zip b/caffe2/python/serialized_test/data/operator_test/channel_backprop_stats_op_test.testChannelBackpropStats.zip new file mode 100644 index 00000000000000..2c3e35be43a175 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/channel_backprop_stats_op_test.testChannelBackpropStats.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/channel_shuffle_test.test_channel_shuffle.zip b/caffe2/python/serialized_test/data/operator_test/channel_shuffle_test.test_channel_shuffle.zip new file mode 100644 index 00000000000000..1c9f5abc9bd0c5 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/channel_shuffle_test.test_channel_shuffle.zip differ diff --git 
a/caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.testChannelStats.zip b/caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.testChannelStats.zip new file mode 100644 index 00000000000000..f0e22405a92f76 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.testChannelStats.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/clip_op_test.test_clip.zip b/caffe2/python/serialized_test/data/operator_test/clip_op_test.test_clip.zip new file mode 100644 index 00000000000000..d19477ff24f940 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/clip_op_test.test_clip.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/clip_tensor_op_test.test_clip_tensor_by_scaling.zip b/caffe2/python/serialized_test/data/operator_test/clip_tensor_op_test.test_clip_tensor_by_scaling.zip new file mode 100644 index 00000000000000..016c1495758286 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/clip_tensor_op_test.test_clip_tensor_by_scaling.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.test_collect_and_dist.zip b/caffe2/python/serialized_test/data/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.test_collect_and_dist.zip new file mode 100644 index 00000000000000..a1768f9b2d700f Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.test_collect_and_dist.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_concat.zip b/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_concat.zip new file mode 100644 index 00000000000000..709d1674d3052d Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_concat.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split.zip b/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split.zip new file mode 100644 index 00000000000000..9939456c7dbc89 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split_by_lengths.zip b/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split_by_lengths.zip new file mode 100644 index 00000000000000..6a22bf9d5b9c41 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split_by_lengths.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/conditional_test.test_conditional.zip b/caffe2/python/serialized_test/data/operator_test/conditional_test.test_conditional.zip new file mode 100644 index 00000000000000..2b3813c79aee7e Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/conditional_test.test_conditional.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/conv_test.test_1x1_conv.zip b/caffe2/python/serialized_test/data/operator_test/conv_test.test_1x1_conv.zip new file mode 100644 index 00000000000000..37b399e1584e51 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/conv_test.test_1x1_conv.zip differ diff --git 
a/caffe2/python/serialized_test/data/operator_test/cosine_embedding_criterion_op_test.test_cosine_embedding_criterion.zip b/caffe2/python/serialized_test/data/operator_test/cosine_embedding_criterion_op_test.test_cosine_embedding_criterion.zip new file mode 100644 index 00000000000000..11701717ee81e2 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/cosine_embedding_criterion_op_test.test_cosine_embedding_criterion.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/ctc_beam_search_decoder_op_test.test_ctc_beam_search_decoder.zip b/caffe2/python/serialized_test/data/operator_test/ctc_beam_search_decoder_op_test.test_ctc_beam_search_decoder.zip new file mode 100644 index 00000000000000..9b87be841729ba Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/ctc_beam_search_decoder_op_test.test_ctc_beam_search_decoder.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/ctc_greedy_decoder_op_test.test_ctc_greedy_decoder.zip b/caffe2/python/serialized_test/data/operator_test/ctc_greedy_decoder_op_test.test_ctc_greedy_decoder.zip new file mode 100644 index 00000000000000..c6a3150b72d008 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/ctc_greedy_decoder_op_test.test_ctc_greedy_decoder.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/distance_op_test.test_dot_product.zip b/caffe2/python/serialized_test/data/operator_test/distance_op_test.test_dot_product.zip new file mode 100644 index 00000000000000..867437401d2f48 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/distance_op_test.test_dot_product.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/dropout_op_test.test_dropout_is_test.zip b/caffe2/python/serialized_test/data/operator_test/dropout_op_test.test_dropout_is_test.zip new file mode 100644 index 00000000000000..ef9530a9a11ce2 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/dropout_op_test.test_dropout_is_test.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/elementwise_linear_op_test.test.zip b/caffe2/python/serialized_test/data/operator_test/elementwise_linear_op_test.test.zip new file mode 100644 index 00000000000000..a8e45abf3c8ea5 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/elementwise_linear_op_test.test.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_is_member_of.zip b/caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_is_member_of.zip new file mode 100644 index 00000000000000..02cb6d516ae68e Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_is_member_of.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_where.zip b/caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_where.zip new file mode 100644 index 00000000000000..61e709080e5c22 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_where.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/elementwise_op_broadcast_test.test_broadcast_powt.zip b/caffe2/python/serialized_test/data/operator_test/elementwise_op_broadcast_test.test_broadcast_powt.zip new file mode 100644 index 00000000000000..407edcbba9a970 Binary files /dev/null and 
b/caffe2/python/serialized_test/data/operator_test/elementwise_op_broadcast_test.test_broadcast_powt.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/expand_op_test.test_expand_rand_shape.zip b/caffe2/python/serialized_test/data/operator_test/expand_op_test.test_expand_rand_shape.zip new file mode 100644 index 00000000000000..21b93d67f95a1d Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/expand_op_test.test_expand_rand_shape.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/fc_operator_test.test_fc.zip b/caffe2/python/serialized_test/data/operator_test/fc_operator_test.test_fc.zip new file mode 100644 index 00000000000000..334cc694c49e57 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/fc_operator_test.test_fc.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_diagonal_fill_op_float.zip b/caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_diagonal_fill_op_float.zip new file mode 100644 index 00000000000000..08d071b420c216 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_diagonal_fill_op_float.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_lengths_range_fill.zip b/caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_lengths_range_fill.zip new file mode 100644 index 00000000000000..338cb152f91aef Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_lengths_range_fill.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/find_op_test.test_find.zip b/caffe2/python/serialized_test/data/operator_test/find_op_test.test_find.zip new file mode 100644 index 00000000000000..23cc6cdedec7f1 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/find_op_test.test_find.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/flexible_top_k_test.test_flexible_top_k.zip b/caffe2/python/serialized_test/data/operator_test/flexible_top_k_test.test_flexible_top_k.zip new file mode 100644 index 00000000000000..234c13761bd8a8 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/flexible_top_k_test.test_flexible_top_k.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/floor_op_test.test_floor.zip b/caffe2/python/serialized_test/data/operator_test/floor_op_test.test_floor.zip new file mode 100644 index 00000000000000..91b7a6ea038fd2 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/floor_op_test.test_floor.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_batch_gather_ops.zip b/caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_batch_gather_ops.zip new file mode 100644 index 00000000000000..2886b5689db08e Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_batch_gather_ops.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_gather_ops.zip b/caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_gather_ops.zip new file mode 100644 index 00000000000000..53885e5840d91f Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_gather_ops.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges.zip 
b/caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges.zip new file mode 100644 index 00000000000000..8396b7975691ac Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges_split.zip b/caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges_split.zip new file mode 100644 index 00000000000000..012725912304cd Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges_split.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/glu_op_test.test_glu_old.zip b/caffe2/python/serialized_test/data/operator_test/glu_op_test.test_glu_old.zip new file mode 100644 index 00000000000000..9fd13e82fb758d Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/glu_op_test.test_glu_old.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/group_norm_op_test.test_group_norm_2d.zip b/caffe2/python/serialized_test/data/operator_test/group_norm_op_test.test_group_norm_2d.zip new file mode 100644 index 00000000000000..72fe453f766f85 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/group_norm_op_test.test_group_norm_2d.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/gru_test.test_gru_unit_op.zip b/caffe2/python/serialized_test/data/operator_test/gru_test.test_gru_unit_op.zip new file mode 100644 index 00000000000000..ce0af76aa78704 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/gru_test.test_gru_unit_op.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_cosh.zip b/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_cosh.zip new file mode 100644 index 00000000000000..1e1eceec001f50 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_cosh.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_sinh.zip b/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_sinh.zip new file mode 100644 index 00000000000000..e2295f316c3a3e Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_sinh.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_tanh.zip b/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_tanh.zip new file mode 100644 index 00000000000000..87eab5db10f061 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_tanh.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/index_hash_ops_test.test_index_hash_ops.zip b/caffe2/python/serialized_test/data/operator_test/index_hash_ops_test.test_index_hash_ops.zip new file mode 100644 index 00000000000000..00a30f3611f954 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/index_hash_ops_test.test_index_hash_ops.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/instance_norm_test.test_instance_norm_reference_check.zip b/caffe2/python/serialized_test/data/operator_test/instance_norm_test.test_instance_norm_reference_check.zip new file mode 100644 index 00000000000000..fa921405003358 Binary files /dev/null and 
b/caffe2/python/serialized_test/data/operator_test/instance_norm_test.test_instance_norm_reference_check.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_gradient_ops.zip b/caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_gradient_ops.zip new file mode 100644 index 00000000000000..a878aedf18c943 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_gradient_ops.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_ops.zip b/caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_ops.zip new file mode 100644 index 00000000000000..1425d94c588e3a Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_ops.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/jsd_ops_test.test_bernoulli_jsd.zip b/caffe2/python/serialized_test/data/operator_test/jsd_ops_test.test_bernoulli_jsd.zip new file mode 100644 index 00000000000000..e723b255e952f7 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/jsd_ops_test.test_bernoulli_jsd.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/layer_norm_op_test.test_layer_norm_grad_op.zip b/caffe2/python/serialized_test/data/operator_test/layer_norm_op_test.test_layer_norm_grad_op.zip new file mode 100644 index 00000000000000..0bdb30fab892a3 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/layer_norm_op_test.test_layer_norm_grad_op.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/learning_rate_adaption_op_test.test_learning_rate_adaption_op_normalization.zip b/caffe2/python/serialized_test/data/operator_test/learning_rate_adaption_op_test.test_learning_rate_adaption_op_normalization.zip new file mode 100644 index 00000000000000..b38de79996df64 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/learning_rate_adaption_op_test.test_learning_rate_adaption_op_normalization.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/learning_rate_op_test.test_alter_learning_rate_op.zip b/caffe2/python/serialized_test/data/operator_test/learning_rate_op_test.test_alter_learning_rate_op.zip new file mode 100644 index 00000000000000..0d37ccb28e3ed2 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/learning_rate_op_test.test_alter_learning_rate_op.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/length_split_op_test.test_length_split_edge.zip b/caffe2/python/serialized_test/data/operator_test/length_split_op_test.test_length_split_edge.zip new file mode 100644 index 00000000000000..cdc2079dc27758 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/length_split_op_test.test_length_split_edge.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/lengths_pad_op_test.test_lengths_pad.zip b/caffe2/python/serialized_test/data/operator_test/lengths_pad_op_test.test_lengths_pad.zip new file mode 100644 index 00000000000000..86d5742dd78e16 Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/lengths_pad_op_test.test_lengths_pad.zip differ diff --git a/caffe2/python/serialized_test/data/operator_test/lengths_tile_op_test.test_lengths_tile.zip 
b/caffe2/python/serialized_test/data/operator_test/lengths_tile_op_test.test_lengths_tile.zip
new file mode 100644
index 00000000000000..ddaa761fab5a68
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/lengths_tile_op_test.test_lengths_tile.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/lengths_top_k_ops_test.test_lengths_top_k_op.zip b/caffe2/python/serialized_test/data/operator_test/lengths_top_k_ops_test.test_lengths_top_k_op.zip
new file mode 100644
index 00000000000000..2ef77b58d9b0d4
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/lengths_top_k_ops_test.test_lengths_top_k_op.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/locally_connected_op_test.test_lc_2d.zip b/caffe2/python/serialized_test/data/operator_test/locally_connected_op_test.test_lc_2d.zip
new file mode 100644
index 00000000000000..2de39276663ad2
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/locally_connected_op_test.test_lc_2d.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/loss_ops_test.test_averaged_loss.zip b/caffe2/python/serialized_test/data/operator_test/loss_ops_test.test_averaged_loss.zip
new file mode 100644
index 00000000000000..0d9b65472803fc
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/loss_ops_test.test_averaged_loss.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/margin_ranking_criterion_op_test.test_margin_ranking_criterion.zip b/caffe2/python/serialized_test/data/operator_test/margin_ranking_criterion_op_test.test_margin_ranking_criterion.zip
new file mode 100644
index 00000000000000..4c2a40e90619e0
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/margin_ranking_criterion_op_test.test_margin_ranking_criterion.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/math_ops_test.test_sign.zip b/caffe2/python/serialized_test/data/operator_test/math_ops_test.test_sign.zip
new file mode 100644
index 00000000000000..6255a751d925c2
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/math_ops_test.test_sign.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_batch_matmul.zip b/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_batch_matmul.zip
new file mode 100644
index 00000000000000..305f6fa75e5fa6
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_batch_matmul.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_matmul.zip b/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_matmul.zip
new file mode 100644
index 00000000000000..3a72d02000f275
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_matmul.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_numpy_batch_matmul.zip b/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_numpy_batch_matmul.zip
new file mode 100644
index 00000000000000..bffbc66c8ea8b1
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_numpy_batch_matmul.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/mean_op_test.test_mean.zip b/caffe2/python/serialized_test/data/operator_test/mean_op_test.test_mean.zip
new file mode 100644
index 00000000000000..2cf316c359a415
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/mean_op_test.test_mean.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/merge_id_lists_op_test.test_merge_id_lists_op.zip b/caffe2/python/serialized_test/data/operator_test/merge_id_lists_op_test.test_merge_id_lists_op.zip
new file mode 100644
index 00000000000000..652b0dc0909105
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/merge_id_lists_op_test.test_merge_id_lists_op.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/moments_op_test.test_moments.zip b/caffe2/python/serialized_test/data/operator_test/moments_op_test.test_moments.zip
new file mode 100644
index 00000000000000..3ff14dcf91ec12
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/moments_op_test.test_moments.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_momentum_sgd.zip b/caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_momentum_sgd.zip
new file mode 100644
index 00000000000000..437698665e19ad
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_momentum_sgd.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_sparse_momentum_sgd.zip b/caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_sparse_momentum_sgd.zip
new file mode 100644
index 00000000000000..3998133efd3545
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_sparse_momentum_sgd.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/negate_gradient_op_test.test_forward.zip b/caffe2/python/serialized_test/data/operator_test/negate_gradient_op_test.test_forward.zip
new file mode 100644
index 00000000000000..855579a23a3524
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/negate_gradient_op_test.test_forward.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/numpy_tile_op_test.test_numpy_tile.zip b/caffe2/python/serialized_test/data/operator_test/numpy_tile_op_test.test_numpy_tile.zip
new file mode 100644
index 00000000000000..8c09282a249a2d
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/numpy_tile_op_test.test_numpy_tile.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_bucketized_one_hot.zip b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_bucketized_one_hot.zip
new file mode 100644
index 00000000000000..76d2671c4fbcbe
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_bucketized_one_hot.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_one_hot.zip b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_one_hot.zip
new file mode 100644
index 00000000000000..d0db2a7d338495
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_one_hot.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_one_hot.zip b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_one_hot.zip
new file mode 100644
index 00000000000000..20e95bdf8633c8
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_one_hot.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_segment_one_hot.zip b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_segment_one_hot.zip
new file mode 100644
index 00000000000000..f6917e3c4ffa23
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_segment_one_hot.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/onnx_while_test.test_onnx_while_fibb.zip b/caffe2/python/serialized_test/data/operator_test/onnx_while_test.test_onnx_while_fibb.zip
new file mode 100644
index 00000000000000..5b29d053454923
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/onnx_while_test.test_onnx_while_fibb.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/pack_ops_test.test_pack_with_max_length_ops.zip b/caffe2/python/serialized_test/data/operator_test/pack_ops_test.test_pack_with_max_length_ops.zip
new file mode 100644
index 00000000000000..09b2dd7e7fb953
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/pack_ops_test.test_pack_with_max_length_ops.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_pack_rnn_seqence.zip b/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_pack_rnn_seqence.zip
new file mode 100644
index 00000000000000..8438af3a9f7867
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_pack_rnn_seqence.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_unpack_rnn_seqence.zip b/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_unpack_rnn_seqence.zip
new file mode 100644
index 00000000000000..953e275453a5a9
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_unpack_rnn_seqence.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/pad_test.test_crop.zip b/caffe2/python/serialized_test/data/operator_test/pad_test.test_crop.zip
new file mode 100644
index 00000000000000..945f89561e9d77
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/pad_test.test_crop.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/piecewise_linear_transform_test.test_multi_predictions_params_from_arg.zip b/caffe2/python/serialized_test/data/operator_test/piecewise_linear_transform_test.test_multi_predictions_params_from_arg.zip
new file mode 100644
index 00000000000000..2339652431c7d4
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/piecewise_linear_transform_test.test_multi_predictions_params_from_arg.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum.zip b/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum.zip
new file mode 100644
index 00000000000000..7bdda94c3fdba6
Binary files /dev/null and b/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum.zip differ
diff --git a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/gradient_0.pb b/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/gradient_0.pb
deleted file mode 100644
index ba59745bd14a7b..00000000000000
Binary files a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/gradient_0.pb and
/dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/inputs.npz b/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/inputs.npz deleted file mode 100644 index 3f35572017ab82..00000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/inputs.npz and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/operator_0.pb b/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/operator_0.pb deleted file mode 100644 index 8fae4791be423c..00000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/operator_0.pb and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/outputs.npz b/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/outputs.npz deleted file mode 100644 index 543a127bedee0e..00000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum/outputs.npz and /dev/null differ diff --git a/caffe2/python/serialized_test/serialized_test_util.py b/caffe2/python/serialized_test/serialized_test_util.py index ad79591fb2e1bf..feb5d8e127cb88 100644 --- a/caffe2/python/serialized_test/serialized_test_util.py +++ b/caffe2/python/serialized_test/serialized_test_util.py @@ -7,14 +7,15 @@ from caffe2.proto import caffe2_pb2 from caffe2.python import gradient_checker import caffe2.python.hypothesis_test_util as hu -from hypothesis import given, seed, settings +import hypothesis as hy import inspect -import numpy +import numpy as np import os import re import shutil import sys import threading +from zipfile import ZipFile operator_test_type = 'operator_test' TOP_DIR = os.path.dirname(os.path.realpath(__file__)) @@ -23,10 +24,10 @@ _output_context = threading.local() -def given_and_seeded(*given_args, **given_kwargs): +def given(*given_args, **given_kwargs): def wrapper(f): - hyp_func = given(*given_args, **given_kwargs)(f) - fixed_seed_func = seed(0)(settings(max_examples=1)(given( + hyp_func = hy.given(*given_args, **given_kwargs)(f) + fixed_seed_func = hy.seed(0)(hy.settings(max_examples=1)(hy.given( *given_args, **given_kwargs)(f))) def func(self, *args, **kwargs): @@ -38,21 +39,36 @@ def func(self, *args, **kwargs): return wrapper +def _getGradientOrNone(op_proto): + try: + grad_ops, _ = gradient_checker.getGradientForOp(op_proto) + return grad_ops + except Exception: + return [] + + +# necessary to support converting jagged lists into numpy arrays +def _transformList(l): + ret = np.empty(len(l), dtype=np.object) + for (i, arr) in enumerate(l): + ret[i] = arr + return ret + + +def _prepare_dir(path): + if os.path.exists(path): + shutil.rmtree(path) + os.makedirs(path) + + class SerializedTestCase(hu.HypothesisTestCase): should_serialize = False def get_output_dir(self): - class_path = inspect.getfile(self.__class__) - file_name_components = os.path.basename(class_path).split('.') - test_file = file_name_components[0] - - function_name_components = self.id().split('.') - test_function = function_name_components[-1] - output_dir_arg = getattr(_output_context, 'output_dir', DATA_DIR) output_dir = os.path.join( - output_dir_arg, operator_test_type, test_file + '.' 
+ test_function) + output_dir_arg, operator_test_type) if os.path.exists(output_dir): return output_dir @@ -65,30 +81,55 @@ def get_output_dir(self): output_dir_fallback = os.path.join(cwd, serialized_dir, DATA_SUFFIX) output_dir = os.path.join( output_dir_fallback, - operator_test_type, - test_file + '.' + test_function) + operator_test_type) return output_dir + def get_output_filename(self): + class_path = inspect.getfile(self.__class__) + file_name_components = os.path.basename(class_path).split('.') + test_file = file_name_components[0] + + function_name_components = self.id().split('.') + test_function = function_name_components[-1] + + return test_file + '.' + test_function + def serialize_test(self, inputs, outputs, grad_ops, op, device_option): - def prepare_dir(path): - if os.path.exists(path): - shutil.rmtree(path) - os.makedirs(path) output_dir = self.get_output_dir() - prepare_dir(output_dir) - for (i, grad) in enumerate(grad_ops): - grad_path = os.path.join(output_dir, 'gradient_{}.pb'.format(i)) - with open(grad_path, 'wb') as f: - f.write(grad.SerializeToString()) + test_name = self.get_output_filename() + full_dir = os.path.join(output_dir, test_name) + _prepare_dir(full_dir) + + inputs = _transformList(inputs) + outputs = _transformList(outputs) device_type = int(device_option.device_type) - op_path = os.path.join(output_dir, 'operator_{}.pb'.format(device_type)) + + op_path = os.path.join(full_dir, 'op.pb') + grad_paths = [] + inout_path = os.path.join(full_dir, 'inout') + with open(op_path, 'wb') as f: f.write(op.SerializeToString()) - numpy.savez_compressed( - os.path.join(output_dir, 'inputs'), inputs=inputs) - numpy.savez_compressed( - os.path.join(output_dir, 'outputs'), outputs=outputs) + for (i, grad) in enumerate(grad_ops): + grad_path = os.path.join(full_dir, 'grad_{}.pb'.format(i)) + grad_paths.append(grad_path) + with open(grad_path, 'wb') as f: + f.write(grad.SerializeToString()) + + np.savez_compressed( + inout_path, + inputs=inputs, + outputs=outputs, + device_type=device_type) + + with ZipFile(os.path.join(output_dir, test_name + '.zip'), 'w') as z: + z.write(op_path, 'op.pb') + z.write(inout_path + '.npz', 'inout.npz') + for path in grad_paths: + z.write(path, os.path.basename(path)) + + shutil.rmtree(full_dir) def compare_test(self, inputs, outputs, grad_ops, atol=1e-7, rtol=1e-7): @@ -98,48 +139,50 @@ def parse_proto(x): return proto source_dir = self.get_output_dir() + test_name = self.get_output_filename() + full_dir = os.path.join(source_dir, test_name) + _prepare_dir(full_dir) + with ZipFile(os.path.join(source_dir, test_name + '.zip')) as z: + loaded = z.extractall(full_dir) + + op_path = os.path.join(full_dir, 'op.pb') + inout_path = os.path.join(full_dir, 'inout.npz') + loaded = np.load(inout_path, encoding='bytes') # load serialized input and output - loaded_inputs = numpy.load( - os.path.join(source_dir, 'inputs.npz'), encoding='bytes')['inputs'] + loaded_inputs = loaded['inputs'].tolist() inputs_equal = True for (x, y) in zip(inputs, loaded_inputs): - if not numpy.array_equal(x, y): + if not np.array_equal(x, y): inputs_equal = False - loaded_outputs = numpy.load(os.path.join( - source_dir, 'outputs.npz'), encoding='bytes')['outputs'] + loaded_outputs = loaded['outputs'].tolist() # load operator - found_op = False - for i in os.listdir(source_dir): - op_file = os.path.join(source_dir, i) - match = re.search('operator_(.+?)\.pb', i) - if os.path.isfile(op_file) and match: - with open(op_file, 'rb') as f: - loaded_op = f.read() - op_proto = 
parse_proto(loaded_op) - device_type = int(match.group(1)) - device_option = caffe2_pb2.DeviceOption(device_type=device_type) - grad_ops, _ = gradient_checker.getGradientForOp(op_proto) - found_op = True - break + with open(op_path, 'rb') as f: + loaded_op = f.read() + + op_proto = parse_proto(loaded_op) + device_type = loaded['device_type'] + device_option = caffe2_pb2.DeviceOption(device_type=int(device_type)) # if inputs are not the same, run serialized input through serialized op if not inputs_equal: - self.assertTrue(found_op) outputs = hu.runOpOnInput(device_option, op_proto, loaded_inputs) + grad_ops = _getGradientOrNone(op_proto) # assert outputs are equal for (x, y) in zip(outputs, loaded_outputs): - numpy.testing.assert_allclose(x, y, atol=atol, rtol=rtol) + np.testing.assert_allclose(x, y, atol=atol, rtol=rtol) # assert gradient op is equal for i in range(len(grad_ops)): - with open(os.path.join(source_dir, 'gradient_{}.pb'.format(i)), 'rb') as f: + with open(os.path.join(full_dir, 'grad_{}.pb'.format(i)), 'rb') as f: loaded_grad = f.read() grad_proto = parse_proto(loaded_grad) self.assertTrue(grad_proto == grad_ops[i]) + shutil.rmtree(full_dir) + def assertSerializedOperatorChecks( self, inputs, @@ -149,7 +192,7 @@ def assertSerializedOperatorChecks( device_option, ): if self.should_serialize: - if getattr(_output_context, 'should_write_output', False): + if getattr(_output_context, 'should_generate_output', False): self.serialize_test( inputs, outputs, gradient_operator, op, device_option) else: @@ -180,29 +223,34 @@ def assertReferenceChecks( atol, outputs_to_check, ) - grad_ops, _ = gradient_checker.getGradientForOp(op) - self.assertSerializedOperatorChecks( - inputs, - outs, - grad_ops, - op, - device_option, - ) + if not getattr(_output_context, 'disable_serialized_check', False): + grad_ops = _getGradientOrNone(op) + self.assertSerializedOperatorChecks( + inputs, + outs, + grad_ops, + op, + device_option, + ) def testWithArgs(): parser = argparse.ArgumentParser() parser.add_argument( - '-g', '--generate-serialized', action='store_true', dest='write', + '-G', '--generate-serialized', action='store_true', dest='generate', help='generate output files (default=false, compares to current files)') parser.add_argument( - '-o', '--output', default=DATA_DIR, + '-O', '--output', default=DATA_DIR, help='output directory (default: %(default)s)') + parser.add_argument( + '-D', '--disable-serialized_check', action='store_true', dest='disable', + help='disable checking serialized tests') parser.add_argument('unittest_args', nargs='*') args = parser.parse_args() sys.argv[1:] = args.unittest_args - _output_context.__setattr__('should_write_output', args.write) + _output_context.__setattr__('should_generate_output', args.generate) _output_context.__setattr__('output_dir', args.output) + _output_context.__setattr__('disable_serialized_check', args.disable) import unittest unittest.main() diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 1a579b519fe09c..383b8410ea6ae2 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -396,8 +396,8 @@ def test_converterEnforceUnusedInputs(self): net = core.Net("net") net.Relu(["X"], ["Y"]) net.Proto().external_input.extend(["fake"]) - with self.assertRaises(Exception): - transformer.AddNNPACK(net) # just testing the converter + # This should now work + transformer.AddNNPACK(net) # just testing the converter def test_converterEnforceUnusedOutputs(self): net = 
core.Net("net") diff --git a/caffe2/python/utils.py b/caffe2/python/utils.py index 75124add41cecd..5e87df8058e017 100644 --- a/caffe2/python/utils.py +++ b/caffe2/python/utils.py @@ -6,13 +6,13 @@ from __future__ import unicode_literals from caffe2.proto import caffe2_pb2 +from caffe2.python.compatibility import container_abcs from future.utils import viewitems from google.protobuf.message import DecodeError, Message from google.protobuf import text_format import sys import copy -import collections import functools import numpy as np from six import integer_types, binary_type, text_type, string_types @@ -120,7 +120,7 @@ def MakeArgument(key, value): """Makes an argument based on the value type.""" argument = caffe2_pb2.Argument() argument.name = key - iterable = isinstance(value, collections.Iterable) + iterable = isinstance(value, container_abcs.Iterable) # Fast tracking common use case where a float32 array of tensor parameters # needs to be serialized. The entire array is guaranteed to have the same diff --git a/caffe2/sgd/adam_op.cc b/caffe2/sgd/adam_op.cc index 25414622bad754..623e93a07e3251 100644 --- a/caffe2/sgd/adam_op.cc +++ b/caffe2/sgd/adam_op.cc @@ -34,7 +34,7 @@ and returns (param_o, m1_o, m2_o, grad_o), in which grad_o is an optional output .Output(0, "output_param", "Updated parameters") .Output(1, "output_moment_1", "Updated first moment") .Output(2, "output_moment_2", "Updated second moment") - .Output(3, "output_grad", "Effective grad") + .Output(3, "output_grad", "Optional Effective gradient") .Arg("beta1", "Default 0.9") .Arg("beta2", "Default 0.999") .Arg("epsilon", "Default 1e-5"); @@ -42,7 +42,7 @@ and returns (param_o, m1_o, m2_o, grad_o), in which grad_o is an optional output REGISTER_CPU_OPERATOR(SparseAdam, SparseAdamOp); OPERATOR_SCHEMA(SparseAdam) .NumInputs(7) - .NumOutputs(3) + .NumOutputs(3, 4) .EnforceInplace({{0, 0}, {1, 1}, {2, 2}}) .SetDoc(R"DOC( @@ -62,6 +62,7 @@ OPERATOR_SCHEMA(SparseAdam) .Output(0, "output_param", "Updated parameters") .Output(1, "output_moment_1", "Updated first moment") .Output(2, "output_moment_2", "Updated second moment") + .Output(3, "output_grad", "Optional Effective gradient") .Arg("beta1", "Default 0.9") .Arg("beta2", "Default 0.999") .Arg("epsilon", "Default 1e-5"); @@ -71,7 +72,7 @@ REGISTER_CPU_OPERATOR( RowWiseSparseAdamOp); OPERATOR_SCHEMA(RowWiseSparseAdam) .NumInputs(7) - .NumOutputs(3) + .NumOutputs(3, 4) .EnforceInplace({{0, 0}, {1, 1}, {2, 2}}) .SetDoc(R"DOC( @@ -95,6 +96,7 @@ OPERATOR_SCHEMA(RowWiseSparseAdam) .Output(0, "output_param", "Updated parameters") .Output(1, "output_moment_1", "Updated first moment") .Output(2, "output_moment_2", "Updated second moment") + .Output(3, "output_grad", "Optional Effective gradient") .Arg("beta1", "Default 0.9") .Arg("beta2", "Default 0.999") .Arg("epsilon", "Default 1e-5"); diff --git a/caffe2/sgd/adam_op.h b/caffe2/sgd/adam_op.h index dadf7f4ee22015..699ba7aa5d23b1 100644 --- a/caffe2/sgd/adam_op.h +++ b/caffe2/sgd/adam_op.h @@ -88,7 +88,7 @@ class AdamOp final : public Operator { epsilon_(this->template GetSingleArgument("epsilon", 1e-5f)) {} bool RunOnDevice() override { // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(ITER, CPU)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(ITER, CPU)); CAFFE_ENFORCE(Input(LR).size() == 1); CAFFE_ENFORCE(Input(GRAD).size() == Input(PARAM).size()); CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENT_1).size()); @@ -195,58 +195,118 @@ class SparseAdamOp final : public Operator { auto* moment1Out = 
Output(OUTPUT_MOMENT_1)->template mutable_data(); auto* moment2Out = Output(OUTPUT_MOMENT_2)->template mutable_data(); - for (auto i = 0; i < n; ++i) { - auto idx = indices[i]; + if (OutputSize() == 3) { + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + + if (block_size == 1) { + float gi = gradIn[i]; + float mi = moment1Out[idx] = + moment1In[idx] * beta1_ + gi * (1 - beta1_); + float vi = moment2Out[idx] = + moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); + paramOut[idx] = paramIn[idx] + + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); + + } else { + auto offsetI = i * block_size; + auto offsetIdx = idx * block_size; - if (block_size == 1) { - float gi = gradIn[i]; - float mi = moment1Out[idx] = - moment1In[idx] * beta1_ + gi * (1 - beta1_); - float vi = moment2Out[idx] = - moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); - paramOut[idx] = - paramIn[idx] + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); +#ifndef NDEBUG + CAFFE_ENFORCE_GE( + Input(PARAM).size(), + block_size + offsetIdx, + this->debug_def().input(PARAM), + ", out of bound, idx:", + idx, + " for input i:", + i, + " and block size:", + block_size); + CAFFE_ENFORCE_GE( + Input(GRAD).size(), + block_size + offsetI, + this->debug_def().input(GRAD), + ", out of bound idx, idx:", + idx, + " for input i:", + i); +#endif - } else { - auto offsetI = i * block_size; - auto offsetIdx = idx * block_size; + adam_compute( + block_size, + paramIn + offsetIdx, + gradIn + offsetI, + moment1In + offsetIdx, + moment2In + offsetIdx, + paramOut + offsetIdx, + moment1Out + offsetIdx, + moment2Out + offsetIdx, + beta1_, + beta2_, + epsilon_, + correction, + lr, + &context_); + } + } + } else { + Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); + auto* gradOut = Output(OUTPUT_GRAD)->template mutable_data(); + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + + if (block_size == 1) { + float gi = gradIn[i]; + float mi = moment1Out[idx] = + moment1In[idx] * beta1_ + gi * (1 - beta1_); + float vi = moment2Out[idx] = + moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); + float ngi = gradOut[i] = correction * mi / (std::sqrt(vi) + epsilon_); + paramOut[idx] = paramIn[idx] + lr[0] * ngi; + + } else { + auto offsetI = i * block_size; + auto offsetIdx = idx * block_size; #ifndef NDEBUG - CAFFE_ENFORCE_GE( - Input(PARAM).size(), - block_size + offsetIdx, - this->debug_def().input(PARAM), - ", out of bound, idx:", - idx, - " for input i:", - i, - " and block size:", - block_size); - CAFFE_ENFORCE_GE( - Input(GRAD).size(), - block_size + offsetI, - this->debug_def().input(GRAD), - ", out of bound idx, idx:", - idx, - " for input i:", - i); + CAFFE_ENFORCE_GE( + Input(PARAM).size(), + block_size + offsetIdx, + this->debug_def().input(PARAM), + ", out of bound, idx:", + idx, + " for input i:", + i, + " and block size:", + block_size); + CAFFE_ENFORCE_GE( + Input(GRAD).size(), + block_size + offsetI, + this->debug_def().input(GRAD), + ", out of bound idx, idx:", + idx, + " for input i:", + i); #endif - adam_compute( - block_size, - paramIn + offsetIdx, - gradIn + offsetI, - moment1In + offsetIdx, - moment2In + offsetIdx, - paramOut + offsetIdx, - moment1Out + offsetIdx, - moment2Out + offsetIdx, - beta1_, - beta2_, - epsilon_, - correction, - lr, - &context_); + adam_compute_output_grad( + block_size, + paramIn + offsetIdx, + gradIn + offsetI, + moment1In + offsetIdx, + moment2In + offsetIdx, + paramOut + offsetIdx, + moment1Out + offsetIdx, + moment2Out + offsetIdx, + gradOut + offsetI, + beta1_, + beta2_, + epsilon_, + 
correction, + lr, + &context_); + } } } return true; @@ -257,7 +317,7 @@ class SparseAdamOp final : public Operator { T beta2_; T epsilon_; INPUT_TAGS(PARAM, MOMENT_1, MOMENT_2, INDICES, GRAD, LR, ITER); - OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2); + OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2, OUTPUT_GRAD); }; template @@ -305,61 +365,126 @@ class RowWiseSparseAdamOp final : public Operator { auto* moment1Out = Output(OUTPUT_MOMENT_1)->template mutable_data(); auto* moment2Out = Output(OUTPUT_MOMENT_2)->template mutable_data(); - for (auto i = 0; i < n; ++i) { - auto idx = indices[i]; + if (OutputSize() == 3) { + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + + if (block_size == 1) { + float gi = gradIn[i]; + float mi = moment1Out[idx] = + moment1In[idx] * beta1_ + gi * (1 - beta1_); + float vi = moment2Out[idx] = + moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); + paramOut[idx] = paramIn[idx] + + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); + + } else { + auto offsetI = i * block_size; + auto offsetIdx = idx * block_size; - if (block_size == 1) { - float gi = gradIn[i]; - float mi = moment1Out[idx] = - moment1In[idx] * beta1_ + gi * (1 - beta1_); - float vi = moment2Out[idx] = - moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); - paramOut[idx] = - paramIn[idx] + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); +#ifndef NDEBUG + CAFFE_ENFORCE_GE( + Input(PARAM).size(), + block_size + offsetIdx, + this->debug_def().input(PARAM), + ", out of bound, idx:", + idx, + " for input i:", + i, + " and block size:", + block_size); + CAFFE_ENFORCE_GE( + Input(GRAD).size(), + block_size + offsetI, + this->debug_def().input(GRAD), + ", out of bound idx, idx:", + idx, + " for input i:", + i); +#endif - } else { - auto offsetI = i * block_size; - auto offsetIdx = idx * block_size; + const float* w = paramIn + offsetIdx; + const float* g = gradIn + offsetI; + const float* m1 = moment1In + offsetIdx; + const float* m2 = moment2In + idx; + float* nw = paramOut + offsetIdx; + float* nm1 = moment1Out + offsetIdx; + float* nm2 = moment2Out + idx; + + float m2_sum = 0.; + for (auto j = 0; j < block_size; ++j) { + float gj = g[j]; + m2_sum += gj * gj; + } + float vi = nm2[0] = + m2[0] * beta2_ + (m2_sum / block_size) * (1 - beta2_); + for (auto j = 0; j < block_size; ++j) { + float mi = nm1[j] = m1[j] * beta1_ + g[j] * (1 - beta1_); + nw[j] = w[j] + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); + } + } + } + } else { + Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); + auto* gradOut = Output(OUTPUT_GRAD)->template mutable_data(); + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + + if (block_size == 1) { + float gi = gradIn[i]; + float mi = moment1Out[idx] = + moment1In[idx] * beta1_ + gi * (1 - beta1_); + float vi = moment2Out[idx] = + moment2In[idx] * beta2_ + gi * gi * (1 - beta2_); + float ngi = gradOut[i] = correction * mi / (std::sqrt(vi) + epsilon_); + paramOut[idx] = paramIn[idx] + lr[0] * ngi; + + } else { + auto offsetI = i * block_size; + auto offsetIdx = idx * block_size; #ifndef NDEBUG - CAFFE_ENFORCE_GE( - Input(PARAM).size(), - block_size + offsetIdx, - this->debug_def().input(PARAM), - ", out of bound, idx:", - idx, - " for input i:", - i, - " and block size:", - block_size); - CAFFE_ENFORCE_GE( - Input(GRAD).size(), - block_size + offsetI, - this->debug_def().input(GRAD), - ", out of bound idx, idx:", - idx, - " for input i:", - i); + CAFFE_ENFORCE_GE( + Input(PARAM).size(), + block_size + offsetIdx, + 
this->debug_def().input(PARAM), + ", out of bound, idx:", + idx, + " for input i:", + i, + " and block size:", + block_size); + CAFFE_ENFORCE_GE( + Input(GRAD).size(), + block_size + offsetI, + this->debug_def().input(GRAD), + ", out of bound idx, idx:", + idx, + " for input i:", + i); #endif - const float* w = paramIn + offsetIdx; - const float* g = gradIn + offsetI; - const float* m1 = moment1In + offsetIdx; - const float* m2 = moment2In + idx; - float* nw = paramOut + offsetIdx; - float* nm1 = moment1Out + offsetIdx; - float* nm2 = moment2Out + idx; - - float m2_sum = 0.; - for (auto j = 0; j < block_size; ++j) { - float gj = g[j]; - m2_sum += gj * gj; - } - float vi = nm2[0] = - m2[0] * beta2_ + (m2_sum / block_size) * (1 - beta2_); - for (auto j = 0; j < block_size; ++j) { - float mi = nm1[j] = m1[j] * beta1_ + g[j] * (1 - beta1_); - nw[j] = w[j] + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); + const float* w = paramIn + offsetIdx; + const float* g = gradIn + offsetI; + const float* m1 = moment1In + offsetIdx; + const float* m2 = moment2In + idx; + float* nw = paramOut + offsetIdx; + float* nm1 = moment1Out + offsetIdx; + float* nm2 = moment2Out + idx; + float* ng = gradOut + offsetI; + + float m2_sum = 0.; + for (auto j = 0; j < block_size; ++j) { + float gj = g[j]; + m2_sum += gj * gj; + } + float vi = nm2[0] = + m2[0] * beta2_ + (m2_sum / block_size) * (1 - beta2_); + for (auto j = 0; j < block_size; ++j) { + float mi = nm1[j] = m1[j] * beta1_ + g[j] * (1 - beta1_); + float ngi = ng[j] = correction * mi / (std::sqrt(vi) + epsilon_); + nw[j] = w[j] + lr[0] * ngi; + } } } } @@ -371,7 +496,7 @@ class RowWiseSparseAdamOp final : public Operator { T beta2_; T epsilon_; INPUT_TAGS(PARAM, MOMENT_1, MOMENT_2, INDICES, GRAD, LR, ITER); - OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2); + OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2, OUTPUT_GRAD); }; } // namespace caffe2 diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu index 41f70ca51d577b..2e142af682f795 100644 --- a/caffe2/sgd/adam_op_gpu.cu +++ b/caffe2/sgd/adam_op_gpu.cu @@ -1,6 +1,6 @@ -#include "caffe2/sgd/adam_op.h" #include "caffe2/core/common_gpu.h" #include "caffe2/core/context_gpu.h" +#include "caffe2/sgd/adam_op.h" namespace caffe2 { @@ -95,6 +95,55 @@ void adam_compute( N, w, g, m, v, nw, nm, nv, beta1, beta2, eps_hat, correction, lr); } +__global__ void AdamComputeOutputGrad( + int N, + const float* w, + const float* g, + const float* m, + const float* v, + float* nw, + float* nm, + float* nv, + float* ng, + float beta1, + float beta2, + float eps_hat, + float correction, + const float* lr) { + CUDA_1D_KERNEL_LOOP(i, N) { + float gi = g[i]; + float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1); + float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2); + float ngi = ng[i] = correction * mi / (sqrtf(vi) + eps_hat); + nw[i] = w[i] + lr[0] * ngi; + } +} + +template <> +void adam_compute_output_grad( + int N, + const float* w, + const float* g, + const float* m, + const float* v, + float* nw, + float* nm, + float* nv, + float* ng, + float beta1, + float beta2, + float eps_hat, + float correction, + const float* lr, + CUDAContext* context) { + AdamComputeOutputGrad<<< + CAFFE_GET_BLOCKS(N), + CAFFE_CUDA_NUM_THREADS, + 0, + context->cuda_stream()>>>( + N, w, g, m, v, nw, nm, nv, ng, beta1, beta2, eps_hat, correction, lr); +} + template __global__ void SparseAdamKernel( const size_t N, @@ -123,9 +172,44 @@ __global__ void SparseAdamKernel( } } +template +__global__ void 
SparseAdamOutputGradKernel( + const size_t N, + const size_t grad_slice_sz, + const float beta1, + const float beta2, + const float epsilon, + float* param, + float* mom1, + float* mom2, + float* output_grad, + const SIndex* indices, + const float* grad, + const float correction, + const float* lr, + const float iter) { + CUDA_1D_KERNEL_LOOP(i, N) { + const size_t gradIdx = i; + const SIndex index = indices[i / grad_slice_sz]; + const size_t paramIdx = index * grad_slice_sz + (i % grad_slice_sz); + + float m1n = mom1[paramIdx] = + mom1[paramIdx] * beta1 + grad[gradIdx] * (1.0f - beta1); + float m2n = mom2[paramIdx] = + mom2[paramIdx] * beta2 + grad[gradIdx] * grad[gradIdx] * (1.0f - beta2); + float gradOut = output_grad[gradIdx] = + correction * m1n / (sqrt(m2n) + epsilon); + param[paramIdx] += lr[0] * gradOut; + } +} + template <> template bool SparseAdamOp::DoRunWithType() { + Output(OUTPUT_PARAM)->ResizeLike(Input(PARAM)); + Output(OUTPUT_MOMENT_1)->ResizeLike(Input(MOMENT_1)); + Output(OUTPUT_MOMENT_2)->ResizeLike(Input(MOMENT_2)); + auto N = Input(GRAD).size(); auto grad_slice_sz = Input(GRAD).size_from_dim(Input(INDICES).ndim()); const auto iter = @@ -133,24 +217,48 @@ bool SparseAdamOp::DoRunWithType() { const float correction = sqrtf(1.0f - std::pow(beta2_, iter + 1)) / (1.0f - std::pow(beta1_, iter + 1)); - SparseAdamKernel - <<>>( - N, - grad_slice_sz, - beta1_, - beta2_, - epsilon_, - Output(OUTPUT_PARAM)->template mutable_data(), - Output(OUTPUT_MOMENT_1)->template mutable_data(), - Output(OUTPUT_MOMENT_2)->template mutable_data(), - Input(INDICES).template data(), - Input(GRAD).template data(), - correction, - Input(LR).template data(), - iter); + if (OutputSize() == 3) { + SparseAdamKernel + <<>>( + N, + grad_slice_sz, + beta1_, + beta2_, + epsilon_, + Output(OUTPUT_PARAM)->template mutable_data(), + Output(OUTPUT_MOMENT_1)->template mutable_data(), + Output(OUTPUT_MOMENT_2)->template mutable_data(), + Input(INDICES).template data(), + Input(GRAD).template data(), + correction, + Input(LR).template data(), + iter); + } else { + Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); + SparseAdamOutputGradKernel + <<>>( + N, + grad_slice_sz, + beta1_, + beta2_, + epsilon_, + Output(OUTPUT_PARAM)->template mutable_data(), + Output(OUTPUT_MOMENT_1)->template mutable_data(), + Output(OUTPUT_MOMENT_2)->template mutable_data(), + Output(OUTPUT_GRAD)->template mutable_data(), + Input(INDICES).template data(), + Input(GRAD).template data(), + correction, + Input(LR).template data(), + iter); + } + return true; } diff --git a/caffe2/sgd/fp16_momentum_sgd_op.h b/caffe2/sgd/fp16_momentum_sgd_op.h index 7b1c68634de228..a8f3ce75c7fc2c 100644 --- a/caffe2/sgd/fp16_momentum_sgd_op.h +++ b/caffe2/sgd/fp16_momentum_sgd_op.h @@ -37,8 +37,8 @@ class FP16MomentumSGDUpdateOp final : public Operator { bool RunOnDevice() override { auto device_type = Context::GetDeviceType(); // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(GRAD, device_type)); - CAFFE_ENFORCE(OperatorBase::InputIsType(MOMENTUM, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(GRAD, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(MOMENTUM, device_type)); CAFFE_ENFORCE(Input(LR).size() == 1); CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENTUM).size()); Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); diff --git a/caffe2/sgd/fp32_momentum_sgd_op.h b/caffe2/sgd/fp32_momentum_sgd_op.h index 75907a63501da9..57ea18a097b099 100644 --- a/caffe2/sgd/fp32_momentum_sgd_op.h +++ 
b/caffe2/sgd/fp32_momentum_sgd_op.h @@ -33,8 +33,8 @@ class FP32MomentumSGDUpdateOp final : public Operator { bool RunOnDevice() override { auto device_type = Context::GetDeviceType(); // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(GRAD, device_type)); - CAFFE_ENFORCE(OperatorBase::InputIsType(MOMENTUM, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(GRAD, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(MOMENTUM, device_type)); CAFFE_ENFORCE(Input(LR).size() == 1); CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENTUM).size()); Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); diff --git a/caffe2/sgd/iter_op.h b/caffe2/sgd/iter_op.h index 91709f47f3453a..22ec8d252c455f 100644 --- a/caffe2/sgd/iter_op.h +++ b/caffe2/sgd/iter_op.h @@ -39,7 +39,7 @@ class IterOp final : public Operator { bool RunOnDevice() override { if (InputSize() == 0) { LOG(INFO) << "[Input size is zero]"; - if (!OperatorBase::OutputIsType(0, CPU)) { + if (!OperatorBase::OutputIsTensorType(0, CPU)) { // This is the first run; set the iter to start with 0. LOG(ERROR) << "You are using an old definition of IterOp that will " "be deprecated soon. More specifically, IterOp now " diff --git a/caffe2/sgd/momentum_sgd_op.h b/caffe2/sgd/momentum_sgd_op.h index 6e79d5dbedc7da..c3f25c84c9b8aa 100644 --- a/caffe2/sgd/momentum_sgd_op.h +++ b/caffe2/sgd/momentum_sgd_op.h @@ -47,8 +47,8 @@ class MomentumSGDOp final : public Operator { bool RunOnDevice() override { auto device_type = Context::GetDeviceType(); // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(GRAD, device_type)); - CAFFE_ENFORCE(OperatorBase::InputIsType(MOMENTUM, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(GRAD, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(MOMENTUM, device_type)); CAFFE_ENFORCE(Input(LR).size() == 1); CAFFE_ENFORCE(Input(GRAD).size() == Input(MOMENTUM).size()); Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); @@ -87,8 +87,8 @@ class MomentumSGDUpdateOp final : public Operator { bool RunOnDevice() override { auto device_type = Context::GetDeviceType(); // Iter live on the CPU - CAFFE_ENFORCE(OperatorBase::InputIsType(GRAD, device_type)); - CAFFE_ENFORCE(OperatorBase::InputIsType(MOMENTUM, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(GRAD, device_type)); + CAFFE_ENFORCE(OperatorBase::InputIsTensorType(MOMENTUM, device_type)); CAFFE_ENFORCE_EQ(Input(LR).size(), 1); CAFFE_ENFORCE_EQ(Input(GRAD).size(), Input(MOMENTUM).size()); Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); diff --git a/caffe2/sgd/yellowfin_op.h b/caffe2/sgd/yellowfin_op.h index 249f638bfac03d..94150413df1750 100644 --- a/caffe2/sgd/yellowfin_op.h +++ b/caffe2/sgd/yellowfin_op.h @@ -126,7 +126,7 @@ CAFFE2_YF_READ_INPUT(SCALARS_MEMORY, scalars_memory) CAFFE2_YF_READ_INPUT(GRAD, grad) #undef CAFFE2_YF_READ_OUTPUT -CAFFE_ENFORCE(OperatorBase::InputIsType(ITER, CPU)); +CAFFE_ENFORCE(OperatorBase::InputIsTensorType(ITER, CPU)); CAFFE_ENFORCE_EQ(lr_avg_tensor.size(), 1); CAFFE_ENFORCE_EQ(mu_avg_tensor.size(), 1); CAFFE_ENFORCE_EQ(param_tensor.ndim(), moment_tensor.ndim()); diff --git a/caffe2/share/contrib/CMakeLists.txt b/caffe2/share/contrib/CMakeLists.txt index 01af2c0616dfce..0fc3a4186f0189 100644 --- a/caffe2/share/contrib/CMakeLists.txt +++ b/caffe2/share/contrib/CMakeLists.txt @@ -1,4 +1,4 @@ -if (USE_NNPACK) +if (USE_NNPACK AND BUILD_CAFFE2_OPS) add_subdirectory(nnpack) endif() if (USE_ZSTD) diff --git a/caffe2/utils/Array.h b/caffe2/utils/Array.h index 
55c4a6a6e50af7..3217198dd8cdf3 100644 --- a/caffe2/utils/Array.h +++ b/caffe2/utils/Array.h @@ -259,7 +259,7 @@ template constexpr _Tp&& get(array<_Tp, _Nm>&& __arr) noexcept { static_assert(_Int < _Nm, "array index is within bounds"); - return std::move(get<_Int>(__arr)); + return guts::move(get<_Int>(__arr)); } template @@ -292,12 +292,12 @@ constexpr inline array tail(const array& arg) { namespace detail { template constexpr inline array prepend_(T&& head, const array& tail, guts::index_sequence) { - return {{std::forward(head), get(tail)...}}; + return {{guts::forward(head), get(tail)...}}; } } template constexpr inline array prepend(T&& head, const array& tail) { - return detail::prepend_(std::forward(head), tail, guts::make_index_sequence()); + return detail::prepend_(guts::forward(head), tail, guts::make_index_sequence()); } /** diff --git a/caffe2/utils/Array_test.cpp b/caffe2/utils/Array_test.cpp index 1d8c290b8a2249..1f3171ebe88eb0 100644 --- a/caffe2/utils/Array_test.cpp +++ b/caffe2/utils/Array_test.cpp @@ -78,11 +78,9 @@ namespace test_tail { static_assert(array < int, 0 > {{}} == tail(array < int, 1 > {{3}}), ""); } -TEST(ArrayTest, TestPrepend) { - // Some compilers can't handle move results as constexpr, so use - // gtest assert for this test - ASSERT_EQ((array {{2, 3, 4}}), (prepend(2, array {{3, 4}}))); - ASSERT_EQ((array {{3}}), (prepend(3, array {{}}))); +namespace test_prepend { + static_assert(array < int, 3 > {{2, 3, 4}} == prepend(2, array < int, 2 > {{3, 4}}), ""); + static_assert(array < int, 1 > {{3}} == prepend(3, array < int, 0 > {{}}), ""); } namespace test_to_std_array { diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu index 920bffc0ae3bfb..2bb11cd22ad70e 100644 --- a/caffe2/utils/math_gpu.cu +++ b/caffe2/utils/math_gpu.cu @@ -130,7 +130,7 @@ __global__ void BroadcastBinaryOpCUDAKernel( } template -void BinaryOpWith2DBroadcasting( +CAFFE2_CUDA_EXPORT void BinaryOpWith2DBroadcasting( const int rows, const int cols, const bool rowwise_broadcast, @@ -177,7 +177,7 @@ void BinaryOpWith2DBroadcasting( } template -void BroadcastBinaryOpImpl( +CAFFE2_CUDA_EXPORT void BroadcastBinaryOpImpl( const int* A_dims, const int* B_dims, const int* C_dims, @@ -212,7 +212,7 @@ void BroadcastBinaryOpImpl( } template -void BroadcastBinaryOp( +CAFFE2_CUDA_EXPORT void BroadcastBinaryOp( const int A_ndim, const int* A_dims, const int B_ndim, @@ -294,7 +294,7 @@ void BroadcastBinaryOp( } \ } \ template <> \ - void Func( \ + CAFFE2_CUDA_EXPORT void Func( \ const int N, const T* x, T* y, CUDAContext* context) { \ Func##CUDAKernel<<< \ CAFFE_GET_BLOCKS(N), \ @@ -362,7 +362,7 @@ DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Inv, utils::Inv) #define CAFFE2_SPECIALIZED_CUDA_SINCOS(T) \ template <> \ - void SinCos( \ + CAFFE2_CUDA_EXPORT void SinCos( \ const int N, const T* x, T* ys, T* yc, CUDAContext* context) { \ SinCosCUDAKernel<<< \ CAFFE_GET_BLOCKS(N), \ @@ -376,7 +376,7 @@ CAFFE2_SPECIALIZED_CUDA_SINCOS(double) #define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - void Func( \ + CAFFE2_CUDA_EXPORT void Func( \ const int N, \ const TIn* A, \ const TIn* B, \ @@ -444,7 +444,7 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION( #define DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - void Rowwise##Func( \ + CAFFE2_CUDA_EXPORT void Rowwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -463,7 +463,7 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION( context->cuda_stream()>>>(size, cols_div, Op(), A, B, C); \ } \ 
template <> \ - void Rowwise##Func( \ + CAFFE2_CUDA_EXPORT void Rowwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -482,7 +482,7 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION( context->cuda_stream()>>>(size, cols_div, Op(), A, B, C); \ } \ template <> \ - void Colwise##Func( \ + CAFFE2_CUDA_EXPORT void Colwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -501,7 +501,7 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION( context->cuda_stream()>>>(size, cols_div, Op(), A, B, C); \ } \ template <> \ - void Colwise##Func( \ + CAFFE2_CUDA_EXPORT void Colwise##Func( \ const int rows, \ const int cols, \ const TIn* A, \ @@ -573,7 +573,7 @@ DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) #define DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ template <> \ - void Func( \ + CAFFE2_CUDA_EXPORT void Func( \ const int A_ndim, \ const int* A_dims, \ const int B_ndim, \ @@ -638,7 +638,7 @@ DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) #define DELEGATE_REDUCTION_FUNCTION(T, Funcname, func) \ template <> \ - void Funcname( \ + CAFFE2_CUDA_EXPORT void Funcname( \ const int N, \ const T* src, \ T* dst, \ @@ -669,7 +669,7 @@ DELEGATE_REDUCTION_FUNCTION(int64_t, ReduceMax, Max) // Caffe2 gemm provides a simpler interface to the gemm functions, with the // limitation that the data has to be contiguous in memory. template <> -void Gemm( +CAFFE2_CUDA_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -710,7 +710,7 @@ void Gemm( } template <> -void Gemm( +CAFFE2_CUDA_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -781,7 +781,7 @@ void Gemm( } template <> -void BiasCHW( +CAFFE2_CUDA_EXPORT void BiasCHW( const float* bias, const float* bias_multiplier, const int bias_channels, @@ -803,7 +803,7 @@ void BiasCHW( } template <> -void GemmBatched( +CAFFE2_CUDA_EXPORT void GemmBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -869,7 +869,7 @@ void GemmBatched( } template <> -void GemmStridedBatched( +CAFFE2_CUDA_EXPORT void GemmStridedBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -930,7 +930,7 @@ void GemmStridedBatched( } template <> -void GemmBatched( +CAFFE2_CUDA_EXPORT void GemmBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -1059,7 +1059,7 @@ void GemmBatched( } template <> -void GemmStridedBatched( +CAFFE2_CUDA_EXPORT void GemmStridedBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -1168,7 +1168,7 @@ void GemmStridedBatched( // No change, but required. 
Defer to default CUDA engine template <> -void Gemm( +CAFFE2_CUDA_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -1186,7 +1186,7 @@ void Gemm( } template <> -void Gemm( +CAFFE2_CUDA_EXPORT void Gemm( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -1245,7 +1245,7 @@ void Gemm( } template <> -void GemmStridedBatched( +CAFFE2_CUDA_EXPORT void GemmStridedBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -1282,7 +1282,7 @@ void GemmStridedBatched( } template <> -void GemmStridedBatched( +CAFFE2_CUDA_EXPORT void GemmStridedBatched( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int batch_size, @@ -1321,7 +1321,7 @@ void GemmStridedBatched( #endif // CUDA_VERSION >= 9000 template <> -void GemmEx( +CAFFE2_CUDA_EXPORT void GemmEx( const CBLAS_TRANSPOSE trans_A, const CBLAS_TRANSPOSE trans_B, const int M, @@ -1362,7 +1362,7 @@ void GemmEx( } template <> -void Gemv( +CAFFE2_CUDA_EXPORT void Gemv( const CBLAS_TRANSPOSE trans_A, const int M, const int N, @@ -1415,7 +1415,7 @@ __global__ void AddStripedBatchKernel( #define CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(T) \ template <> \ - void AddStripedBatch( \ + CAFFE2_CUDA_EXPORT void AddStripedBatch( \ const int N, \ const T* first, \ T* Y, \ @@ -1434,7 +1434,7 @@ CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(float16); #undef CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH template <> -void Gemv( +CAFFE2_CUDA_EXPORT void Gemv( const CBLAS_TRANSPOSE trans_A, const int M, const int N, @@ -1514,7 +1514,7 @@ __global__ void SetKernel(const int N, const T alpha, T* Y) { #define CAFFE2_SPECIALIZED_CUDA_SET(T) \ template <> \ - void Set( \ + CAFFE2_CUDA_API void Set( \ const size_t N, const T alpha, T* Y, CUDAContext* context) { \ if (N == 0) { \ return; \ @@ -1542,7 +1542,7 @@ CAFFE2_SPECIALIZED_CUDA_SET(uint16_t); #undef CAFFE2_SPECIALIZED_CUDA_SET template <> -void Set( +CAFFE2_CUDA_EXPORT void Set( const size_t N, const float16 alpha, float16* Y, @@ -1577,7 +1577,7 @@ UniformIntFit(const size_t N, const int min, const int max, unsigned int* x) { } // namespace template <> -void RandUniform( +CAFFE2_CUDA_EXPORT void RandUniform( const size_t n, const float min, const float max, @@ -1592,7 +1592,7 @@ void RandUniform( } template <> -void RandUniform( +CAFFE2_CUDA_EXPORT void RandUniform( const size_t n, const double min, const double max, @@ -1608,7 +1608,7 @@ void RandUniform( } template <> -void RandUniform( +CAFFE2_CUDA_EXPORT void RandUniform( const size_t n, const int min, const int max, @@ -1642,7 +1642,7 @@ size_t HandleOddLengthRandGaussian( } template <> -void RandGaussian( +CAFFE2_CUDA_EXPORT void RandGaussian( const size_t n, const float mean, const float std, @@ -1658,7 +1658,7 @@ void RandGaussian( } template <> -void RandGaussian( +CAFFE2_CUDA_EXPORT void RandGaussian( const size_t n, const double mean, const double std, @@ -1671,7 +1671,7 @@ void RandGaussian( } template <> -void Dot( +CAFFE2_CUDA_EXPORT void Dot( const int n, const float* a, const float* b, @@ -1683,7 +1683,7 @@ void Dot( } template <> -void Dot( +CAFFE2_CUDA_EXPORT void Dot( const int n, const float16* a, const float16* b, @@ -1760,7 +1760,7 @@ __global__ void SumConvertKernel(float* sum, T* dest) { } template -void SumGenericIter( +CAFFE2_CUDA_EXPORT void SumGenericIter( const int N, IterT it, T*& dest, @@ -1789,7 +1789,7 @@ void SumGenericIter( } // namespace template <> -void Sum( +CAFFE2_CUDA_EXPORT void Sum( const int N, const float* 
x, float* y, @@ -1804,7 +1804,7 @@ void Sum( } template <> -void Sum( +CAFFE2_CUDA_EXPORT void Sum( const int N, const int32_t* x, int32_t* y, @@ -1829,7 +1829,7 @@ struct FloatTransform { #define CAFFE2_MATH_SUM_FUNC(T) \ template <> \ - void Sum( \ + CAFFE2_CUDA_EXPORT void Sum( \ const int N, \ const T* x, \ T* y, \ @@ -1861,7 +1861,7 @@ struct SqrTransform { } // namespace template <> -void SumSqr( +CAFFE2_CUDA_EXPORT void SumSqr( const int N, const float* x, float* y, @@ -1880,7 +1880,7 @@ void SumSqr( #define CAFFE2_MATH_SUMSQR_FUNC(T) \ template <> \ - void SumSqr( \ + CAFFE2_CUDA_EXPORT void SumSqr( \ const int N, \ const T* x, \ T* y, \ @@ -1920,7 +1920,7 @@ SelectKernel(const int N, const int D, const T* x, const int* idx, T* y) { } // namespace template <> -void Select( +CAFFE2_CUDA_EXPORT void Select( const int N, const int D, const float* x, @@ -1935,7 +1935,7 @@ void Select( } template <> -void Select( +CAFFE2_CUDA_EXPORT void Select( const int N, const int D, const float16* x, @@ -1985,7 +1985,7 @@ __global__ void PowKernel(const int n, const T* x, const T exponent, T* y) { } // namespace template <> -void Powx( +CAFFE2_CUDA_EXPORT void Powx( const int N, const float* a, const float b, @@ -2000,7 +2000,7 @@ void Powx( #define DELEGATE_CUBLAS_SCALE_FUNCTION(TAlpha, TData, CuBLASFunc) \ template <> \ - void Scale( \ + CAFFE2_CUDA_EXPORT void Scale( \ const int N, \ const TAlpha alpha, \ const TData* x, \ @@ -2024,7 +2024,7 @@ void Powx( } \ } \ template <> \ - void Scale( \ + CAFFE2_CUDA_EXPORT void Scale( \ const int N, \ const TAlpha* alpha, \ const TData* x, \ @@ -2051,7 +2051,7 @@ DELEGATE_CUBLAS_SCALE_FUNCTION(double, double, cublasDscal) #define CAFFE2_SPECIALIZED_CUDA_SCALE(TAlpha, TData) \ template <> \ - void Scale( \ + CAFFE2_CUDA_EXPORT void Scale( \ const int N, \ const TAlpha alpha, \ const TData* x, \ @@ -2078,7 +2078,7 @@ DELEGATE_CUBLAS_SCALE_FUNCTION(double, double, cublasDscal) context->cuda_stream()>>>(N, alpha, x, y); \ } \ template <> \ - void Scale( \ + CAFFE2_CUDA_EXPORT void Scale( \ const int N, \ const TAlpha* alpha, \ const TData* x, \ @@ -2098,7 +2098,7 @@ CAFFE2_SPECIALIZED_CUDA_SCALE(std::int64_t, std::int64_t) #undef CAFFE2_SPECIALIZED_CUDA_SCALE template <> -void Scale( +CAFFE2_CUDA_EXPORT void Scale( const int N, const float16 alpha, const float16* x, @@ -2129,7 +2129,7 @@ void Scale( } template <> -void Scale( +CAFFE2_CUDA_EXPORT void Scale( const int N, const float16* alpha, const float16* x, @@ -2160,7 +2160,7 @@ void Scale( } template <> -void Scale( +CAFFE2_CUDA_EXPORT void Scale( const int N, const float alpha, const float16* x, @@ -2193,7 +2193,7 @@ void Scale( } template <> -void Scale( +CAFFE2_CUDA_EXPORT void Scale( const int N, const float* alpha, const float16* x, @@ -2224,7 +2224,7 @@ void Scale( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float alpha, const float* X, @@ -2236,7 +2236,7 @@ void Axpy( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float alpha, const double* X, @@ -2250,7 +2250,7 @@ void Axpy( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float alpha, const float16* X, @@ -2273,7 +2273,7 @@ void Axpy( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float* alpha, const float* X, @@ -2285,7 +2285,7 @@ void Axpy( } template <> -void Axpy( +CAFFE2_CUDA_EXPORT void Axpy( const int N, const float* alpha, const float16* X, @@ -2379,7 +2379,7 @@ __global__ void AxpbyCUDAKernel( #define 
CAFFE2_SPECIALIZED_CUDA_AXPBY(TCoeff, TData) \ template <> \ - void Axpby( \ + CAFFE2_CUDA_EXPORT void Axpby( \ const int n, \ const TCoeff a, \ const TData* x, \ @@ -2393,7 +2393,7 @@ __global__ void AxpbyCUDAKernel( context->cuda_stream()>>>(n, a, x, b, y); \ } \ template <> \ - void Axpby( \ + CAFFE2_CUDA_EXPORT void Axpby( \ const int n, \ const TCoeff* a, \ const TData* x, \ @@ -2468,7 +2468,7 @@ __global__ void Im2ColNCHWCUDAKernel( } template -__global__ void Im2ColNHWCCUDAKernel( +__global__ void Im2ColNHWCCUDAKernel( const int n, const int input_h, const int input_w, @@ -2519,7 +2519,7 @@ __global__ void Im2ColNHWCCUDAKernel( } template -__global__ void Col2ImNCHWCUDAKernel( +__global__ void Col2ImNCHWCUDAKernel( const int n, const int input_h, const int input_w, @@ -2574,7 +2574,7 @@ __global__ void Col2ImNCHWCUDAKernel( } template -__global__ void Col2ImNHWCCUDAKernel( +__global__ void Col2ImNHWCCUDAKernel( const int n, const int input_w, const int channels, @@ -2627,7 +2627,7 @@ __global__ void Col2ImNHWCCUDAKernel( } template -__global__ void Im2ColNdNCHWCUDAKernel( +__global__ void Im2ColNdNCHWCUDAKernel( const int outer_size, const int inner_size, const int kernel_size, @@ -2683,7 +2683,7 @@ __global__ void Im2ColNdNCHWCUDAKernel( } template -void Im2ColNdNCHWCUDAImpl( +CAFFE2_CUDA_EXPORT void Im2ColNdNCHWCUDAImpl( const int img_size, const int col_size, const int* img_shape, @@ -2730,7 +2730,7 @@ void Im2ColNdNCHWCUDAImpl( } template -void Col2ImNdNCHWCUDAImpl( +CAFFE2_CUDA_EXPORT void Col2ImNdNCHWCUDAImpl( const int img_size, const int col_size, const int* img_shape, @@ -2780,7 +2780,7 @@ void Col2ImNdNCHWCUDAImpl( } // namespace template <> -void Im2Col( +CAFFE2_CUDA_EXPORT void Im2Col( const int channels, const int height, const int width, @@ -2826,7 +2826,7 @@ void Im2Col( } template <> -void Im2Col( +CAFFE2_CUDA_EXPORT void Im2Col( const int channels, const int height, const int width, @@ -2874,7 +2874,7 @@ void Im2Col( } template <> -void Col2Im( +CAFFE2_CUDA_EXPORT void Col2Im( const int channels, const int height, const int width, @@ -2920,7 +2920,7 @@ void Col2Im( } template <> -void Col2Im( +CAFFE2_CUDA_EXPORT void Col2Im( const int channels, const int height, const int width, @@ -2968,7 +2968,7 @@ void Col2Im( } template <> -void Im2ColNd( +CAFFE2_CUDA_EXPORT void Im2ColNd( const int N, const int img_size, const int col_size, @@ -2999,7 +2999,7 @@ void Im2ColNd( } template <> -void Col2ImNd( +CAFFE2_CUDA_EXPORT void Col2ImNd( const int N, const int img_size, const int col_size, @@ -3030,7 +3030,7 @@ void Col2ImNd( } template <> -void CopyMatrix( +CAFFE2_CUDA_EXPORT void CopyMatrix( const size_t itemsize, const int M, const int N, @@ -3082,7 +3082,7 @@ CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX(TIndex) #undef CAFFE2_SPECIALIZED_CUDA_COPY_MATRIX template <> -void CopyVector( +CAFFE2_CUDA_EXPORT void CopyVector( const int N, const float* src, float* dst, @@ -3152,7 +3152,7 @@ __global__ void ColwiseReduceKernel( #define CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX(T) \ template <> \ - void RowwiseMax( \ + CAFFE2_CUDA_EXPORT void RowwiseMax( \ const int N, const int D, const T* x, T* y, CUDAContext* context) { \ RowwiseReduceKernel<<< \ std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), \ @@ -3166,7 +3166,7 @@ CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX(float) #define CAFFE2_SPECIALIZED_CUDA_COLWISE_MAX(T) \ template <> \ - void ColwiseMax( \ + CAFFE2_CUDA_EXPORT void ColwiseMax( \ const int N, const int D, const T* x, T* y, CUDAContext* context) { \ ColwiseReduceKernel<<< \ std::min(D, 
CAFFE_MAXIMUM_NUM_BLOCKS), \ @@ -3188,7 +3188,7 @@ maximum_kernel(const int N, const float alpha, const float* x, float* y) { } // namespace template <> -void Maximum( +CAFFE2_CUDA_EXPORT void Maximum( const int N, const float alpha, const float* x, @@ -3241,7 +3241,7 @@ __global__ void ReduceTensorCUDAKernel( } template -void ReduceTensorCUDAImpl( +CAFFE2_CUDA_EXPORT void ReduceTensorCUDAImpl( const int outer_size, const int inner_size, const int* dims, @@ -3275,7 +3275,7 @@ void ReduceTensorCUDAImpl( } template -void ReduceTensorCUDA( +CAFFE2_CUDA_EXPORT void ReduceTensorCUDA( const int num_dims, const int* dims, const int num_axes, @@ -3353,7 +3353,7 @@ void ReduceTensorCUDA( #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(T) \ template <> \ - void ReduceMin( \ + CAFFE2_CUDA_EXPORT void ReduceMin( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3382,7 +3382,7 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(double) #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(T) \ template <> \ - void ReduceMax( \ + CAFFE2_CUDA_EXPORT void ReduceMax( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3411,7 +3411,7 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(double) #define CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(T) \ template <> \ - void ReduceSum( \ + CAFFE2_CUDA_EXPORT void ReduceSum( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3440,7 +3440,7 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(double) #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN(T) \ template <> \ - void ReduceMean( \ + CAFFE2_CUDA_EXPORT void ReduceMean( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3496,7 +3496,7 @@ __global__ void BroadcastCUDAKernel( } template -void BroadcastCUDAImpl( +CAFFE2_CUDA_EXPORT void BroadcastCUDAImpl( const int X_ndim, const int* X_dims, const int* Y_dims, @@ -3534,7 +3534,7 @@ void BroadcastCUDAImpl( #define CAFFE2_SPECIALIZED_CUDA_BROADCAST(T) \ template <> \ - void Broadcast( \ + CAFFE2_CUDA_EXPORT void Broadcast( \ const int X_ndim, \ const int* X_dims, \ const int Y_ndim, \ @@ -3676,7 +3676,7 @@ __global__ void MomentsCUDAKernel( } template -void MomentsCUDAImpl( +CAFFE2_CUDA_EXPORT void MomentsCUDAImpl( const int outer_size, const int inner_size, const int* dims, @@ -3700,7 +3700,7 @@ void MomentsCUDAImpl( } template -void MomentsCUDA( +CAFFE2_CUDA_EXPORT void MomentsCUDA( const int num_dims, const int* dims, const int num_axes, @@ -3783,7 +3783,7 @@ void MomentsCUDA( #define CAFFE2_SPECIALIZED_CUDA_MOMENTS(T) \ template <> \ - void Moments( \ + CAFFE2_CUDA_EXPORT void Moments( \ const int num_dims, \ const int* dims, \ const int num_axes, \ @@ -3819,7 +3819,7 @@ DELEGATE_INV_STD_KERNEL_FUNCTION(float, rsqrtf) #define CAFFE2_SPECIALIZED_CUDA_INV_STD(T) \ template <> \ - void InvStd( \ + CAFFE2_CUDA_EXPORT void InvStd( \ const int N, \ const T epsilon, \ const T* var, \ @@ -3861,7 +3861,7 @@ __global__ void TransposeCUDAKernel( } template -void TransposeCUDAImpl( +CAFFE2_CUDA_EXPORT void TransposeCUDAImpl( const int* dims, const int* axes, const T* X, @@ -3886,7 +3886,7 @@ void TransposeCUDAImpl( #define CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(T) \ template <> \ - void Transpose( \ + CAFFE2_CUDA_EXPORT void Transpose( \ const int ndim, \ const int* dims, \ const int* axes, \ @@ -3933,7 +3933,7 @@ __global__ void AffineChannelCUDAKernel( #define CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL(T, kOrder) \ template <> \ - void AffineChannel( \ + CAFFE2_CUDA_EXPORT void AffineChannel( \ const int N, \ const int C, \ const int HxW, \ diff --git a/caffe2/utils/proto_wrap.cc 
b/caffe2/utils/proto_wrap.cc index b573968d9095ed..eb06524cae8417 100644 --- a/caffe2/utils/proto_wrap.cc +++ b/caffe2/utils/proto_wrap.cc @@ -29,3 +29,18 @@ void ShutdownProtobufLibrary() { } } // namespace caffe2 + +namespace torch { + +// Caffe2 wrapper functions for protobuf's GetEmptyStringAlreadyInited() function +// used to avoid duplicated global variable in the case when protobuf +// is built with hidden visibility. +CAFFE2_API const ::std::string& GetEmptyStringAlreadyInited() { + return ::google::protobuf::internal::GetEmptyStringAlreadyInited(); +} + +void ShutdownProtobufLibrary() { + ::google::protobuf::ShutdownProtobufLibrary(); +} + +} // namespace torch diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index ff838c58889e45..7d9a18eda18489 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -167,12 +167,20 @@ if (NOT BUILD_ATEN_MOBILE) file(GLOB_RECURSE all_templates "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/templates/*") + # these are files that are generated by the script and checked in -- the script checks + # that they are equivalent so it must be a dependency of the script + set(core_gen_checked_inputs + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core/Type.h + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core/Tensor.h + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core/TensorMethods.h) + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/aten/src/ATen) + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/aten/src/ATen/core_tmp) add_custom_command(OUTPUT ${generated_cpp} ${cuda_generated_cpp} COMMAND ${GEN_COMMAND} --install_dir ${CMAKE_BINARY_DIR}/aten/src/ATen - DEPENDS ${all_python} ${all_templates} ${cwrap_files}) + DEPENDS ${all_python} ${all_templates} ${cwrap_files} ${core_gen_checked_inputs}) # Generated headers used from a CUDA (.cu) file are # not tracked correctly in CMake. We make the libATen.so depend explicitly diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index d8d8927a05241e..db36ab66bbaeca 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -187,20 +187,19 @@ endif() if(BUILD_TEST) # Preserve build options. set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) - set(TEMP_CMAKE_DEBUG_POSTFIX ${CMAKE_DEBUG_POSTFIX}) # We will build gtest as static libs and embed it directly into the binary. set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libs" FORCE) # For gtest, we will simply embed it into our test binaries, so we won't # need to install it. - set(BUILD_GTEST ON) - set(INSTALL_GTEST OFF) + set(BUILD_GTEST ON CACHE BOOL "Build gtest" FORCE) + set(INSTALL_GTEST OFF CACHE BOOL "Install gtest." FORCE) # We currently don't need gmock right now. - set(BUILD_GMOCK OFF) + set(BUILD_GMOCK OFF CACHE BOOL "Build gmock." FORCE) # For Windows, we will check the runtime used is correctly passed in. if (NOT CAFFE2_USE_MSVC_STATIC_RUNTIME) - set(gtest_force_shared_crt ON) + set(gtest_force_shared_crt ON CACHE BOOL "force shared crt on gtest" FORCE) endif() add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/googletest) include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../third_party/googletest/googletest/include) @@ -212,10 +211,8 @@ if(BUILD_TEST) add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/benchmark) include_directories(${CMAKE_CURRENT_LIST_DIR}/../third_party/benchmark/include) - # Recover build options. Unfortunately gtest modifies CMAKE_DEBUG_POSTFIX - # in some versions as detailed at https://github.com/google/googletest/issues/1334 + # Recover build options. 
set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE) - set(CMAKE_DEBUG_POSTFIX ${TEMP_CMAKE_DEBUG_POSTFIX} CACHE BOOL "Debug postfix" FORCE) endif() # ---[ LMDB @@ -352,7 +349,9 @@ if(BUILD_PYTHON) execute_process( COMMAND "which" "python" RESULT_VARIABLE _exitcode OUTPUT_VARIABLE _py_exe) if(${_exitcode} EQUAL 0) - string(STRIP ${_py_exe} PYTHON_EXECUTABLE) + if (NOT MSVC) + string(STRIP ${_py_exe} PYTHON_EXECUTABLE) + endif() message(STATUS "Setting Python to ${PYTHON_EXECUTABLE}") endif() endif() @@ -391,7 +390,11 @@ if(BUILD_PYTHON) pycmd_no_exit(_py_lib _exitcode "from sysconfig import get_paths; print(get_paths()['stdlib'])") if("${_exitcode}" EQUAL 0 AND EXISTS "${_py_lib}" AND EXISTS "${_py_lib}") SET(PYTHON_LIBRARY "${_py_lib}") - message(STATUS "Setting Python's library to ${_py_lib}") + if (MSVC) + STRING(REPLACE "Lib" "libs" _py_static_lib ${_py_lib}) + link_directories(${_py_static_lib}) + endif() + message(STATUS "Setting Python's library to ${PYTHON_LIBRARY}") endif() endif(NOT DEFINED PYTHON_LIBRARY) @@ -545,6 +548,7 @@ if(NOT BUILD_ATEN_MOBILE) list(APPEND HIP_HIPCC_FLAGS -Wno-shift-count-negative) list(APPEND HIP_HIPCC_FLAGS -Wno-shift-count-overflow) list(APPEND HIP_HIPCC_FLAGS -Wno-unused-command-line-argument) + list(APPEND HIP_HIPCC_FLAGS -Wno-duplicate-decl-specifier) set(Caffe2_HIP_INCLUDES ${hip_INCLUDE_DIRS} ${hcc_INCLUDE_DIRS} ${hsa_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${thrust_INCLUDE_DIRS} $ ${Caffe2_HIP_INCLUDES}) @@ -770,7 +774,7 @@ if (USE_NNAPI AND NOT ANDROID) caffe2_update_option(USE_NNAPI OFF) endif() -if (NOT BUILD_ATEN_MOBILE) +if (NOT BUILD_ATEN_MOBILE AND BUILD_CAFFE2_OPS) if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) list(APPEND Caffe2_DEPENDENCY_LIBS aten_op_header_gen) if (USE_CUDA) @@ -795,6 +799,11 @@ if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) # We will build onnx as static libs and embed it directly into the binary. 
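The ONNX block that begins here reuses the same save/override/restore idiom as the gtest section above: remember the user's BUILD_SHARED_LIBS choice, force static archives while the bundled dependency is configured, then restore the value for the rest of the tree. Roughly, with some_dep/ as a placeholder for a bundled dependency such as third_party/onnx:

    # Save/override/restore sketch for embedding a bundled dependency statically
    set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})  # remember the user's choice
    set(BUILD_SHARED_LIBS OFF)                        # configure the dependency as static archives
    add_subdirectory(some_dep)                        # placeholder, e.g. third_party/onnx
    set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS})  # restore for the rest of the build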
set(BUILD_SHARED_LIBS OFF) + if (MSVC AND BUILD_SHARED_LIBS) + # That also means we want to export all symbols from the shared + # library we are building + set(ONNX_BUILD_MAIN_LIB ON) + endif() set(ONNX_USE_MSVC_STATIC_RUNTIME ${CAFFE2_USE_MSVC_STATIC_RUNTIME}) set(ONNX_USE_LITE_PROTO ${CAFFE2_USE_LITE_PROTO}) # If linking local protobuf, make sure ONNX has the same protobuf diff --git a/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake b/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake index edc9b3ab3fda74..b2ca36a9677771 100644 --- a/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake +++ b/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake @@ -531,7 +531,9 @@ option(CUDA_HOST_COMPILATION_CPP "Generated file extension" ON) # Extra user settable flags cmake_initialize_per_config_variable(CUDA_NVCC_FLAGS "Semi-colon delimit multiple arguments.") -if(CMAKE_GENERATOR MATCHES "Visual Studio") +if(DEFINED ENV{CUDA_HOST_COMPILER}) + set(CUDA_HOST_COMPILER "$ENV{CUDA_HOST_COMPILER}" CACHE FILEPATH "Host side compiler used by NVCC") +elseif(CMAKE_GENERATOR MATCHES "Visual Studio") set(_CUDA_MSVC_HOST_COMPILER "$(VCInstallDir)Tools/MSVC/$(VCToolsVersion)/bin/Host$(Platform)/$(PlatformTarget)") if(MSVC_VERSION LESS 1910) set(_CUDA_MSVC_HOST_COMPILER "$(VCInstallDir)bin") diff --git a/cmake/ProtoBuf.cmake b/cmake/ProtoBuf.cmake index 9adc2a2be8347a..ed12b3b90e5480 100644 --- a/cmake/ProtoBuf.cmake +++ b/cmake/ProtoBuf.cmake @@ -179,7 +179,7 @@ function(caffe2_protobuf_generate_cpp_py srcs_var hdrs_var python_var) # If we remove all reference to these pb.h files from external # libraries and binaries this rewrite can be removed. - COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake + COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx\;torch -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake DEPENDS ${CAFFE2_PROTOC_EXECUTABLE} ${abs_fil} COMMENT "Running C++/Python protocol buffer compiler on ${fil}" VERBATIM ) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 846f4d6154b8ba..a314e4d348b708 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -38,6 +38,7 @@ function (caffe2_print_configuration_summary) message(STATUS " Python includes : ${PYTHON_INCLUDE_DIRS}") message(STATUS " Python site-packages: ${PYTHON_SITE_PACKAGES}") endif() + message(STATUS " BUILD_CAFFE2_OPS : ${BUILD_CAFFE2_OPS}") message(STATUS " BUILD_SHARED_LIBS : ${BUILD_SHARED_LIBS}") message(STATUS " BUILD_TEST : ${BUILD_TEST}") @@ -124,6 +125,7 @@ function (caffe2_print_configuration_summary) message(STATUS " USE_GLOO : ${USE_GLOO}") message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}") endif() + message(STATUS " TORCH_USE_CEREAL : ${TORCH_USE_CEREAL}") message(STATUS " Public Dependencies : ${Caffe2_PUBLIC_DEPENDENCY_LIBS}") message(STATUS " Private Dependencies : ${Caffe2_DEPENDENCY_LIBS}") diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in index a14b2e1b0e8b44..066a7e63f9c57a 100644 --- a/cmake/TorchConfig.cmake.in +++ b/cmake/TorchConfig.cmake.in @@ -7,49 +7,67 @@ # # TORCH_FOUND -- True if the system has the Torch library # TORCH_INCLUDE_DIRS -- The include directories for torch -# TORCH_LIBRARIES -- Libraries to link to +# TORCH_LIBRARIES -- Libraries to link against +# TORCH_CXX_FLAGS -- Additional (required) compiler flags # # and the following imported targets: # -# Torch -# -# and the following functions: -# -# 
torch_add_custom_op_library( ) +# torch -SET(TORCH_ROOT "${CMAKE_CURRENT_LIST_DIR}/../") +if ($ENV{TORCH_INSTALL_PREFIX}) + set(TORCH_INSTALL_PREFIX $ENV{TORCH_INSTALL_PREFIX}) +else() + # Assume we are in /share/cmake/Torch/TorchConfig.cmake + get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) + get_filename_component(TORCH_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../../../" ABSOLUTE) +endif() -set(TORCH_INCLUDE_DIRS - "${TORCH_ROOT}" - "${TORCH_ROOT}/aten/src" - "${CMAKE_CURRENT_LIST_DIR}/aten/src" - "${CMAKE_CURRENT_LIST_DIR}/caffe2/aten/src" - "${CMAKE_CURRENT_LIST_DIR}/caffe2/aten/src/TH" -) +# Include directories. +if (EXISTS "${TORCH_INSTALL_PREFIX}/lib/include") + set(TORCH_INCLUDE_DIRS "${TORCH_INSTALL_PREFIX}/lib/include") +else() + set(TORCH_INCLUDE_DIRS "${TORCH_INSTALL_PREFIX}/include") +endif() + +# Library dependencies. +find_package(Caffe2 REQUIRED) -find_library(TORCH_LIBRARY torch PATHS "${CMAKE_CURRENT_LIST_DIR}/lib" NO_DEFAULT_PATH) -find_library(CAFFE2_LIBRARY caffe2 PATHS "${CMAKE_CURRENT_LIST_DIR}/lib" NO_DEFAULT_PATH) +find_library(TORCH_LIBRARY torch PATHS "${TORCH_INSTALL_PREFIX}/lib") +add_library(torch SHARED IMPORTED) +set(TORCH_LIBRARIES torch ${Caffe2_MAIN_LIBS}) if (@USE_CUDA@) - find_package(CUDA REQUIRED) - find_library(CAFFE2_CUDA_LIBRARY caffe2_gpu PATHS "${CMAKE_CURRENT_LIST_DIR}/lib" NO_DEFAULT_PATH) - set(TORCH_CUDA_LIBRARIES -L${CUDA_TOOLKIT_ROOT_DIR}/lib64 cuda nvrtc cudart nvToolsExt) - list(APPEND TORCH_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) + if(MSVC) + set(NVTOOLEXT_HOME "C:/Program Files/NVIDIA Corporation/NvToolsExt") + if ($ENV{NVTOOLEXT_HOME}) + set(NVTOOLEXT_HOME $ENV{NVTOOLEXT_HOME}) + endif() + set(TORCH_CUDA_LIBRARIES + ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib + ${CUDA_LIBRARIES}) + list(APPEND TORCH_INCLUDE_DIRS "${NVTOOLEXT_HOME}/include") + elseif(APPLE) + set(TORCH_CUDA_LIBRARIES + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcudart.dylib + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvrtc.dylib + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvToolsExt.dylib + ${CUDA_LIBRARIES}) + else() + set(TORCH_CUDA_LIBRARIES + ${CUDA_CUDA_LIB} + ${CUDA_NVRTC_LIB} + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvToolsExt.so + ${CUDA_LIBRARIES}) + endif() + list(APPEND TORCH_LIBRARIES ${TORCH_CUDA_LIBRARIES}) endif() -set(TORCH_LIBRARIES - ${TORCH_LIBRARY} - ${CAFFE2_LIBRARY} - ${CAFFE2_CUDA_LIBRARY} - ${TORCH_CUDA_LIBRARIES}) +# When we build libtorch with the old GCC ABI, dependent libraries must too. +set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=@GLIBCXX_USE_CXX11_ABI@") -# Creates a shared library with the correct include directories -# and linker flags set to include Torch header files and link with Torch -# libraries. Also sets the C++ standard version to C++11. All options -# can be override by specifying further options on the `` CMake target. 
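With the helper function removed just below, the rewritten TorchConfig.cmake expects downstream projects to consume the imported torch target directly: link against ${TORCH_LIBRARIES}, and the ABI define carried in TORCH_CXX_FLAGS propagates through the target's interface compile options. A minimal consumer sketch (my_op and my_op.cpp are illustrative names, not part of this patch):

    # Downstream CMakeLists.txt sketch
    cmake_minimum_required(VERSION 3.5)
    project(my_op CXX)

    # Point CMAKE_PREFIX_PATH (or Torch_DIR) at the libtorch install so this resolves.
    find_package(Torch REQUIRED)

    add_library(my_op SHARED my_op.cpp)
    # TORCH_LIBRARIES contains the imported `torch` target plus the Caffe2 and
    # CUDA dependencies assembled above; linking it also pulls in
    # -D_GLIBCXX_USE_CXX11_ABI=... via the target's INTERFACE_COMPILE_OPTIONS.
    target_link_libraries(my_op "${TORCH_LIBRARIES}")
    set_property(TARGET my_op PROPERTY CXX_STANDARD 11)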
-function(torch_add_custom_op_library name source_files) - add_library(${name} SHARED ${source_files}) - target_include_directories(${name} PUBLIC "${TORCH_INCLUDE_DIRS}") - target_link_libraries(${name} "${TORCH_LIBRARIES}") - target_compile_options(${name} PUBLIC -std=c++11) -endfunction(torch_add_custom_op_library) +set_target_properties(torch PROPERTIES + IMPORTED_LOCATION ${TORCH_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${TORCH_INCLUDE_DIRS} + INTERFACE_COMPILE_OPTIONS ${TORCH_CXX_FLAGS} + CXX_STANDARD 11 +) diff --git a/docker/caffe2/jenkins/common/install_rocm.sh b/docker/caffe2/jenkins/common/install_rocm.sh index c69d857118b2d7..82692d0acdb9b6 100644 --- a/docker/caffe2/jenkins/common/install_rocm.sh +++ b/docker/caffe2/jenkins/common/install_rocm.sh @@ -60,25 +60,30 @@ install_rocrand() { # Install rocSPARSE/hipSPARSE that will be released soon - can co-exist w/ hcSPARSE which will be removed soon install_hipsparse() { mkdir -p /opt/rocm/debians - curl https://s3.amazonaws.com/ossci-linux/rocsparse-0.1.1.0.deb -o /opt/rocm/debians/rocsparse.deb - curl https://s3.amazonaws.com/ossci-linux/hipsparse-0.1.1.0.deb -o /opt/rocm/debians/hipsparse.deb + curl https://s3.amazonaws.com/ossci-linux/rocsparse-0.1.2.114-Linux.deb -o /opt/rocm/debians/rocsparse.deb + curl https://s3.amazonaws.com/ossci-linux/hipsparse-0.1.2.55-Linux.deb -o /opt/rocm/debians/hipsparse.deb dpkg -i /opt/rocm/debians/rocsparse.deb dpkg -i /opt/rocm/debians/hipsparse.deb } # Install custom hcc containing two compiler fixes relevant to PyTorch install_customhcc() { + HIP_VERSION="1.5.18354" mkdir -p /opt/rocm/debians - curl https://s3.amazonaws.com/ossci-linux/hcc-1.2.18272-Linux.deb -o /opt/rocm/debians/hcc-1.2.18272-Linux.deb - curl https://s3.amazonaws.com/ossci-linux/hip_base-1.5.18276.deb -o /opt/rocm/debians/hip_base-1.5.18276.deb - curl https://s3.amazonaws.com/ossci-linux/hip_doc-1.5.18276.deb -o /opt/rocm/debians/hip_doc-1.5.18276.deb - curl https://s3.amazonaws.com/ossci-linux/hip_samples-1.5.18276.deb -o /opt/rocm/debians/hip_samples-1.5.18276.deb - curl https://s3.amazonaws.com/ossci-linux/hip_hcc-1.5.18276.deb -o /opt/rocm/debians/hip_hcc-1.5.18276.deb - dpkg -i /opt/rocm/debians/hcc-1.2.18272-Linux.deb - dpkg -i /opt/rocm/debians/hip_base-1.5.18276.deb - dpkg -i /opt/rocm/debians/hip_doc-1.5.18276.deb - dpkg -i /opt/rocm/debians/hip_samples-1.5.18276.deb - dpkg -i /opt/rocm/debians/hip_hcc-1.5.18276.deb + curl https://s3.amazonaws.com/ossci-linux/hcc-1.2.18272-Linux.deb -o /opt/rocm/debians/hcc-Linux.deb + curl "https://s3.amazonaws.com/ossci-linux/hip_base-$HIP_VERSION.deb" -o /opt/rocm/debians/hip_base.deb + curl "https://s3.amazonaws.com/ossci-linux/hip_doc-$HIP_VERSION.deb" -o /opt/rocm/debians/hip_doc.deb + curl "https://s3.amazonaws.com/ossci-linux/hip_samples-$HIP_VERSION.deb" -o /opt/rocm/debians/hip_samples.deb + curl "https://s3.amazonaws.com/ossci-linux/hip_hcc-$HIP_VERSION.deb" -o /opt/rocm/debians/hip_hcc.deb + dpkg -i /opt/rocm/debians/hcc-Linux.deb + dpkg -i /opt/rocm/debians/hip_base.deb + dpkg -i /opt/rocm/debians/hip_doc.deb + dpkg -i /opt/rocm/debians/hip_samples.deb + dpkg -i /opt/rocm/debians/hip_hcc.deb + + if [[ -f /opt/rocm/hip/cmake/FindHIP.cmake ]]; then + sudo sed -i 's/\ -I${dir}/\ $<$:-I${dir}>/' /opt/rocm/hip/cmake/FindHIP.cmake + fi } # Install Python packages depending on the base OS diff --git a/docs/Makefile b/docs/Makefile index 4a56c12ca22d89..59c2397bb023e1 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -39,4 +39,4 @@ html-stable: clean: @echo "Removing everything 
under 'build'.." - @rm -r $(BUILDDIR)/html/ $(BUILDDIR)/doctrees + @rm -rf $(BUILDDIR)/html/ $(BUILDDIR)/doctrees diff --git a/docs/cpp/Doxyfile b/docs/cpp/Doxyfile deleted file mode 100644 index a21dcf8184ad52..00000000000000 --- a/docs/cpp/Doxyfile +++ /dev/null @@ -1,2028 +0,0 @@ -# Doxyfile 1.8.14 - -# This file describes the settings to be used by the documentation system -# doxygen (www.doxygen.org) for a project. -# -# All text after a double hash (##) is considered a comment and is placed in -# front of the TAG it is preceding. -# -# All text after a single hash (#) is considered a comment and will be ignored. -# The format is: -# TAG = value [value, ...] -# For lists, items can also be appended using: -# TAG += value [value, ...] -# Values that contain spaces should be placed between quotes (\" \"). - -#--------------------------------------------------------------------------- -# Project related configuration options -#--------------------------------------------------------------------------- - -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all text -# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv -# built into libc) for the transcoding. See -# https://www.gnu.org/software/libiconv/ for the list of possible encodings. -# The default value is: UTF-8. - -DOXYFILE_ENCODING = UTF-8 - -# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by -# double-quotes, unless you are using Doxywizard) that should identify the -# project for which the documentation is generated. This name is used in the -# title of most generated pages and in a few other places. -# The default value is: My Project. - -PROJECT_NAME = "PyTorch" - -# The PROJECT_NUMBER tag can be used to enter a project or revision number. This -# could be handy for archiving the generated documentation or if some version -# control system is used. - -PROJECT_NUMBER = - -# Using the PROJECT_BRIEF tag one can provide an optional one line description -# for a project that appears at the top of each page and should give viewer a -# quick idea about the purpose of the project. Keep the description short. - -PROJECT_BRIEF = - -# With the PROJECT_LOGO tag one can specify a logo or an icon that is included -# in the documentation. The maximum height of the logo should not exceed 55 -# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy -# the logo to the output directory. - -PROJECT_LOGO = - -# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path -# into which the generated documentation will be written. If a relative path is -# entered, it will be relative to the location where doxygen was started. If -# left blank the current directory will be used. - -OUTPUT_DIRECTORY = build - -# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- -# directories (in 2 levels) under the output directory of each output format and -# will distribute the generated files over these directories. Enabling this -# option can be useful when feeding doxygen a huge amount of source files, where -# putting all generated files in the same directory would otherwise causes -# performance problems for the file system. -# The default value is: NO. - -CREATE_SUBDIRS = NO - -# The OUTPUT_LANGUAGE tag is used to specify the language in which all -# documentation generated by doxygen is written. 
Doxygen will use this -# information to generate all constant output in the proper language. -# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, -# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), -# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, -# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), -# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, -# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, -# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, -# Ukrainian and Vietnamese. -# The default value is: English. - -OUTPUT_LANGUAGE = English - -# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member -# descriptions after the members that are listed in the file and class -# documentation (similar to Javadoc). Set to NO to disable this. -# The default value is: YES. - -BRIEF_MEMBER_DESC = YES - -# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief -# description of a member or function before the detailed description -# -# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the -# brief descriptions will be completely suppressed. -# The default value is: YES. - -REPEAT_BRIEF = YES - -# This tag implements a quasi-intelligent brief description abbreviator that is -# used to form the text in various listings. Each string in this list, if found -# as the leading text of the brief description, will be stripped from the text -# and the result, after processing the whole list, is used as the annotated -# text. Otherwise, the brief description is used as-is. If left blank, the -# following values are used ($name is automatically replaced with the name of -# the entity):The $name class, The $name widget, The $name file, is, provides, -# specifies, contains, represents, a, an and the. - -ABBREVIATE_BRIEF = "The $name class" \ - "The $name widget" \ - "The $name file" \ - is \ - provides \ - specifies \ - contains \ - represents \ - a \ - an \ - the - -# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then -# doxygen will generate a detailed section even if there is only a brief -# description. -# The default value is: NO. - -ALWAYS_DETAILED_SEC = NO - -# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all -# inherited members of a class in the documentation of that class as if those -# members were ordinary class members. Constructors, destructors and assignment -# operators of the base classes will not be shown. -# The default value is: NO. - -INLINE_INHERITED_MEMB = NO - -# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path -# before files name in the file list and in the header files. If set to NO the -# shortest path that makes the file name unique will be used -# The default value is: YES. - -FULL_PATH_NAMES = YES - -# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. -# Stripping is only done if one of the specified strings matches the left-hand -# part of the path. The tag can be used to show relative paths in the file list. -# If left blank the directory from which doxygen is run is used as the path to -# strip. -# -# Note that you can specify absolute paths here, but also relative paths, which -# will be relative from the directory where doxygen is started. -# This tag requires that the tag FULL_PATH_NAMES is set to YES. 
- -STRIP_FROM_PATH = - -# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the -# path mentioned in the documentation of a class, which tells the reader which -# header file to include in order to use a class. If left blank only the name of -# the header file containing the class definition is used. Otherwise one should -# specify the list of include paths that are normally passed to the compiler -# using the -I flag. - -STRIP_FROM_INC_PATH = - -# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but -# less readable) file names. This can be useful is your file systems doesn't -# support long names like on DOS, Mac, or CD-ROM. -# The default value is: NO. - -SHORT_NAMES = NO - -# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the -# first line (until the first dot) of a Javadoc-style comment as the brief -# description. If set to NO, the Javadoc-style will behave just like regular Qt- -# style comments (thus requiring an explicit @brief command for a brief -# description.) -# The default value is: NO. - -JAVADOC_AUTOBRIEF = YES - -# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first -# line (until the first dot) of a Qt-style comment as the brief description. If -# set to NO, the Qt-style will behave just like regular Qt-style comments (thus -# requiring an explicit \brief command for a brief description.) -# The default value is: NO. - -QT_AUTOBRIEF = NO - -# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a -# multi-line C++ special comment block (i.e. a block of //! or /// comments) as -# a brief description. This used to be the default behavior. The new default is -# to treat a multi-line C++ comment block as a detailed description. Set this -# tag to YES if you prefer the old behavior instead. -# -# Note that setting this tag to YES also means that rational rose comments are -# not recognized any more. -# The default value is: NO. - -MULTILINE_CPP_IS_BRIEF = NO - -# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the -# documentation from any documented member that it re-implements. -# The default value is: YES. - -INHERIT_DOCS = YES - -# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new -# page for each member. If set to NO, the documentation of a member will be part -# of the file/class/namespace that contains it. -# The default value is: NO. - -SEPARATE_MEMBER_PAGES = NO - -# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen -# uses this value to replace tabs by spaces in code fragments. -# Minimum value: 1, maximum value: 16, default value: 4. - -TAB_SIZE = 4 - -# This tag can be used to specify a number of aliases that act as commands in -# the documentation. An alias has the form: -# name=value -# For example adding -# "sideeffect=@par Side Effects:\n" -# will allow you to put the command \sideeffect (or @sideeffect) in the -# documentation, which will result in a user-defined paragraph with heading -# "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines (in the resulting output). You can put ^^ in the value part of an -# alias to insert a newline as if a physical newline was in the original file. - -ALIASES = "rst=\verbatim embed:rst:leading-asterisk" -ALIASES += "endrst=\endverbatim" - -# This tag can be used to specify a number of word-keyword mappings (TCL only). -# A mapping has the form "name=value". 
For example adding "class=itcl::class" -# will allow you to use the command class in the itcl::class meaning. - -TCL_SUBST = - -# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources -# only. Doxygen will then generate output that is more tailored for C. For -# instance, some of the names that are used will be different. The list of all -# members will be omitted, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_FOR_C = NO - -# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or -# Python sources only. Doxygen will then generate output that is more tailored -# for that language. For instance, namespaces will be presented as packages, -# qualified scopes will look different, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_JAVA = NO - -# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran -# sources. Doxygen will then generate output that is tailored for Fortran. -# The default value is: NO. - -OPTIMIZE_FOR_FORTRAN = NO - -# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL -# sources. Doxygen will then generate output that is tailored for VHDL. -# The default value is: NO. - -OPTIMIZE_OUTPUT_VHDL = NO - -# Doxygen selects the parser to use depending on the extension of the files it -# parses. With this tag you can assign which parser to use for a given -# extension. Doxygen has a built-in mapping, but you can override or extend it -# using this tag. The format is ext=language, where ext is a file extension, and -# language is one of the parsers supported by doxygen: IDL, Java, Javascript, -# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: -# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: -# Fortran. In the later case the parser tries to guess whether the code is fixed -# or free formatted code, this is the default for Fortran type files), VHDL. For -# instance to make doxygen treat .inc files as Fortran files (default is PHP), -# and .f files as C (default is Fortran), use: inc=Fortran f=C. -# -# Note: For files without extension you can use no_extension as a placeholder. -# -# Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. - -EXTENSION_MAPPING = - -# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments -# according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. -# The output of markdown processing is further processed by doxygen, so you can -# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in -# case of backward compatibilities issues. -# The default value is: YES. - -MARKDOWN_SUPPORT = YES - -# When enabled doxygen tries to link words that correspond to documented -# classes, or namespaces to their corresponding documentation. Such a link can -# be prevented in individual cases by putting a % sign in front of the word or -# globally by setting AUTOLINK_SUPPORT to NO. -# The default value is: YES. - -AUTOLINK_SUPPORT = YES - -# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want -# to include (a tag file for) the STL sources as input, then you should set this -# tag to YES in order to let doxygen match functions declarations and -# definitions whose arguments contain STL classes (e.g. func(std::string); -# versus func(std::string) {}). 
This also make the inheritance and collaboration -# diagrams that involve STL classes more complete and accurate. -# The default value is: NO. - -BUILTIN_STL_SUPPORT = NO - -# If you use Microsoft's C++/CLI language, you should set this option to YES to -# enable parsing support. -# The default value is: NO. - -CPP_CLI_SUPPORT = NO - -# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen -# will parse them like normal C++ but will assume all classes use public instead -# of private inheritance when no explicit protection keyword is present. -# The default value is: NO. - -SIP_SUPPORT = NO - -# For Microsoft's IDL there are propget and propput attributes to indicate -# getter and setter methods for a property. Setting this option to YES will make -# doxygen to replace the get and set methods by a property in the documentation. -# This will only work if the methods are indeed getting or setting a simple -# type. If this is not the case, or you want to show the methods anyway, you -# should set this option to NO. -# The default value is: YES. - -IDL_PROPERTY_SUPPORT = YES - -# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC -# tag is set to YES then doxygen will reuse the documentation of the first -# member in the group (if any) for the other members of the group. By default -# all members of a group must be documented explicitly. -# The default value is: NO. - -DISTRIBUTE_GROUP_DOC = NO - -# Set the SUBGROUPING tag to YES to allow class member groups of the same type -# (for instance a group of public functions) to be put as a subgroup of that -# type (e.g. under the Public Functions section). Set it to NO to prevent -# subgrouping. Alternatively, this can be done per class using the -# \nosubgrouping command. -# The default value is: YES. - -SUBGROUPING = YES - -# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions -# are shown inside the group in which they are included (e.g. using \ingroup) -# instead of on a separate page (for HTML and Man pages) or section (for LaTeX -# and RTF). -# -# Note that this feature does not work in combination with -# SEPARATE_MEMBER_PAGES. -# The default value is: NO. - -INLINE_GROUPED_CLASSES = NO - -# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions -# with only public data fields or simple typedef fields will be shown inline in -# the documentation of the scope in which they are defined (i.e. file, -# namespace, or group documentation), provided this scope is documented. If set -# to NO, structs, classes, and unions are shown on a separate page (for HTML and -# Man pages) or section (for LaTeX and RTF). -# The default value is: NO. - -INLINE_SIMPLE_STRUCTS = NO - -# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or -# enum is documented as struct, union, or enum with the name of the typedef. So -# typedef struct TypeS {} TypeT, will appear in the documentation as a struct -# with name TypeT. When disabled the typedef will appear as a member of a file, -# namespace, or class. And the struct will be named TypeS. This can typically be -# useful for C code in case the coding convention dictates that all compound -# types are typedef'ed and only the typedef is referenced, never the tag name. -# The default value is: NO. - -TYPEDEF_HIDES_STRUCT = NO - -# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. 
This -# cache is used to resolve symbols given their name and scope. Since this can be -# an expensive process and often the same symbol appears multiple times in the -# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small -# doxygen will become slower. If the cache is too large, memory is wasted. The -# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range -# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 -# symbols. At the end of a run doxygen will report the cache usage and suggest -# the optimal cache size from a speed point of view. -# Minimum value: 0, maximum value: 9, default value: 0. - -LOOKUP_CACHE_SIZE = 0 - -#--------------------------------------------------------------------------- -# Build related configuration options -#--------------------------------------------------------------------------- - -# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in -# documentation are documented, even if no documentation was available. Private -# class members and static file members will be hidden unless the -# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. -# Note: This will also disable the warnings about undocumented members that are -# normally produced when WARNINGS is set to YES. -# The default value is: NO. - -EXTRACT_ALL = YES - -# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will -# be included in the documentation. -# The default value is: NO. - -EXTRACT_PRIVATE = YES - -# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal -# scope will be included in the documentation. -# The default value is: NO. - -EXTRACT_PACKAGE = YES - -# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be -# included in the documentation. -# The default value is: NO. - -EXTRACT_STATIC = YES - -# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined -# locally in source files will be included in the documentation. If set to NO, -# only classes defined in header files are included. Does not have any effect -# for Java sources. -# The default value is: YES. - -EXTRACT_LOCAL_CLASSES = YES - -# This flag is only useful for Objective-C code. If set to YES, local methods, -# which are defined in the implementation section but not in the interface are -# included in the documentation. If set to NO, only methods in the interface are -# included. -# The default value is: NO. - -EXTRACT_LOCAL_METHODS = NO - -# If this flag is set to YES, the members of anonymous namespaces will be -# extracted and appear in the documentation as a namespace called -# 'anonymous_namespace{file}', where file will be replaced with the base name of -# the file that contains the anonymous namespace. By default anonymous namespace -# are hidden. -# The default value is: NO. - -EXTRACT_ANON_NSPACES = NO - -# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all -# undocumented members inside documented classes or files. If set to NO these -# members will be included in the various overviews, but no documentation -# section is generated. This option has no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_MEMBERS = NO - -# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all -# undocumented classes that are normally visible in the class hierarchy. If set -# to NO, these classes will be included in the various overviews. 
This option -# has no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_CLASSES = NO - -# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend -# (class|struct|union) declarations. If set to NO, these declarations will be -# included in the documentation. -# The default value is: NO. - -HIDE_FRIEND_COMPOUNDS = NO - -# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any -# documentation blocks found inside the body of a function. If set to NO, these -# blocks will be appended to the function's detailed documentation block. -# The default value is: NO. - -HIDE_IN_BODY_DOCS = NO - -# The INTERNAL_DOCS tag determines if documentation that is typed after a -# \internal command is included. If the tag is set to NO then the documentation -# will be excluded. Set it to YES to include the internal documentation. -# The default value is: NO. - -INTERNAL_DOCS = NO - -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. If set to YES, upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. -# The default value is: system dependent. - -CASE_SENSE_NAMES = NO - -# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with -# their full class and namespace scopes in the documentation. If set to YES, the -# scope will be hidden. -# The default value is: NO. - -HIDE_SCOPE_NAMES = NO - -# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of -# the files that are included by a file in the documentation of that file. -# The default value is: YES. - -SHOW_INCLUDE_FILES = YES - -# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include -# files with double quotes in the documentation rather than with sharp brackets. -# The default value is: NO. - -FORCE_LOCAL_INCLUDES = NO - -# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the -# documentation for inline members. -# The default value is: YES. - -INLINE_INFO = YES - -# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the -# (detailed) documentation of file and class members alphabetically by member -# name. If set to NO, the members will appear in declaration order. -# The default value is: YES. - -SORT_MEMBER_DOCS = YES - -# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief -# descriptions of file, namespace and class members alphabetically by member -# name. If set to NO, the members will appear in declaration order. Note that -# this will also influence the order of the classes in the class list. -# The default value is: NO. - -SORT_BRIEF_DOCS = NO - -# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the -# (brief and detailed) documentation of class members so that constructors and -# destructors are listed first. If set to NO the constructors will appear in the -# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. -# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief -# member documentation. -# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting -# detailed member documentation. -# The default value is: NO. 
- -SORT_MEMBERS_CTORS_1ST = NO - -# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy -# of group names into alphabetical order. If set to NO the group names will -# appear in their defined order. -# The default value is: NO. - -SORT_GROUP_NAMES = NO - -# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by -# fully-qualified names, including namespaces. If set to NO, the class list will -# be sorted only by class name, not including the namespace part. -# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. -# Note: This option applies only to the class list, not to the alphabetical -# list. -# The default value is: NO. - -SORT_BY_SCOPE_NAME = NO - -# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper -# type resolution of all parameters of a function it will reject a match between -# the prototype and the implementation of a member function even if there is -# only one candidate or it is obvious which candidate to choose by doing a -# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still -# accept a match between prototype and implementation in such cases. -# The default value is: NO. - -STRICT_PROTO_MATCHING = NO - -# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo -# list. This list is created by putting \todo commands in the documentation. -# The default value is: YES. - -GENERATE_TODOLIST = YES - -# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test -# list. This list is created by putting \test commands in the documentation. -# The default value is: YES. - -GENERATE_TESTLIST = YES - -# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug -# list. This list is created by putting \bug commands in the documentation. -# The default value is: YES. - -GENERATE_BUGLIST = YES - -# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) -# the deprecated list. This list is created by putting \deprecated commands in -# the documentation. -# The default value is: YES. - -GENERATE_DEPRECATEDLIST= YES - -# The ENABLED_SECTIONS tag can be used to enable conditional documentation -# sections, marked by \if ... \endif and \cond -# ... \endcond blocks. - -ENABLED_SECTIONS = - -# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the -# initial value of a variable or macro / define can have for it to appear in the -# documentation. If the initializer consists of more lines than specified here -# it will be hidden. Use a value of 0 to hide initializers completely. The -# appearance of the value of individual variables and macros / defines can be -# controlled using \showinitializer or \hideinitializer command in the -# documentation regardless of this setting. -# Minimum value: 0, maximum value: 10000, default value: 30. - -MAX_INITIALIZER_LINES = 30 - -# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at -# the bottom of the documentation of classes and structs. If set to YES, the -# list will mention the files that were used to generate the documentation. -# The default value is: YES. - -SHOW_USED_FILES = YES - -# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This -# will remove the Files entry from the Quick Index and from the Folder Tree View -# (if specified). -# The default value is: YES. - -SHOW_FILES = YES - -# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces -# page. 
This will remove the Namespaces entry from the Quick Index and from the -# Folder Tree View (if specified). -# The default value is: YES. - -SHOW_NAMESPACES = YES - -# The FILE_VERSION_FILTER tag can be used to specify a program or script that -# doxygen should invoke to get the current version for each file (typically from -# the version control system). Doxygen will invoke the program by executing (via -# popen()) the command command input-file, where command is the value of the -# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided -# by doxygen. Whatever the program writes to standard output is used as the file -# version. For an example see the documentation. - -FILE_VERSION_FILTER = - -# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed -# by doxygen. The layout file controls the global structure of the generated -# output files in an output format independent way. To create the layout file -# that represents doxygen's defaults, run doxygen with the -l option. You can -# optionally specify a file name after the option, if omitted DoxygenLayout.xml -# will be used as the name of the layout file. -# -# Note that if you run doxygen from a directory containing a file called -# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE -# tag is left empty. - -LAYOUT_FILE = - -GENERATE_LATEX = NO - -# The CITE_BIB_FILES tag can be used to specify one or more bib files containing -# the reference definitions. This must be a list of .bib files. The .bib -# extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. -# For LaTeX the style of the bibliography can be controlled using -# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the -# search path. See also \cite for info how to create references. - -CITE_BIB_FILES = - -#--------------------------------------------------------------------------- -# Configuration options related to warning and progress messages -#--------------------------------------------------------------------------- - -# The QUIET tag can be used to turn on/off the messages that are generated to -# standard output by doxygen. If QUIET is set to YES this implies that the -# messages are off. -# The default value is: NO. - -QUIET = YES - -# The WARNINGS tag can be used to turn on/off the warning messages that are -# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES -# this implies that the warnings are on. -# -# Tip: Turn warnings on while writing the documentation. -# The default value is: YES. - -WARNINGS = YES - -# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate -# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag -# will automatically be disabled. -# The default value is: YES. - -WARN_IF_UNDOCUMENTED = NO - -# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for -# potential errors in the documentation, such as not documenting some parameters -# in a documented function, or documenting parameters that don't exist or using -# markup commands wrongly. -# The default value is: YES. - -WARN_IF_DOC_ERROR = YES - -# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that -# are documented, but have no documentation for their parameters or return -# value. 
If set to NO, doxygen will only warn about wrong or incomplete -# parameter documentation, but not about the absence of documentation. -# The default value is: NO. - -WARN_NO_PARAMDOC = NO - -# The WARN_FORMAT tag determines the format of the warning messages that doxygen -# can produce. The string should contain the $file, $line, and $text tags, which -# will be replaced by the file and line number from which the warning originated -# and the warning text. Optionally the format may contain $version, which will -# be replaced by the version of the file (if it could be obtained via -# FILE_VERSION_FILTER) -# The default value is: $file:$line: $text. - -WARN_FORMAT = "$file:$line: $text" - -# The WARN_LOGFILE tag can be used to specify a file to which warning and error -# messages should be written. If left blank the output is written to standard -# error (stderr). - -WARN_LOGFILE = - -#--------------------------------------------------------------------------- -# Configuration options related to the input files -#--------------------------------------------------------------------------- - -# The INPUT tag is used to specify the files and/or directories that contain -# documented source files. You may enter file names like myfile.cpp or -# directories like /usr/src/myproject. Separate the files or directories with -# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING -# Note: If this tag is empty the current directory is searched. - -INPUT = ../../torch/csrc/api/include \ - ../../torch/csrc/api/src \ - ../../aten/src/ATen/ATen.h \ - ../../aten/src/ATen/Backend.h \ - ../../aten/src/ATen/Device.h \ - ../../aten/src/ATen/DeviceGuard.h \ - ../../aten/src/ATen/Layout.h \ - ../../aten/src/ATen/OptionsGuard.h \ - ../../aten/src/ATen/Scalar.h \ - ../../aten/src/ATen/TensorOptions.h \ - ../../aten/src/ATen/core/ArrayRef.h \ - ../../aten/src/ATen/core/DeviceType.h \ - ../../aten/src/ATen/core/Error.h \ - ../../aten/src/ATen/core/Half.h \ - ../../aten/src/ATen/core/ScalarType.h \ - ../../aten/src/ATen/cuda/CUDAGuard.h \ - ../../aten/src/ATen/cuda/CUDAStream.h \ - ../../aten/src/ATen/cuda/CUDAContext.h \ - ../../aten/src/ATen/cudnn/Descriptors.h \ - ../../aten/src/ATen/cudnn/Handles.h \ - ../../aten/src/ATen/cudnn/Types.h \ - ../../aten/src/ATen/cudnn/Utils.h \ - ../../aten/src/ATen/mkl/Descriptors.h \ - ../../build/aten/src/ATen/Tensor.h \ - ../../build/aten/src/ATen/Functions.h \ - -# This tag can be used to specify the character encoding of the source files -# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses -# libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: https://www.gnu.org/software/libiconv/) for the list of -# possible encodings. -# The default value is: UTF-8. - -INPUT_ENCODING = UTF-8 - -# If the value of the INPUT tag contains directories, you can use the -# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and -# *.h) to filter out the source-files in the directories. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# read by doxygen. 
-# -# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, -# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, -# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, -# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, -# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. - -FILE_PATTERNS = *.h *.cpp - -# The RECURSIVE tag can be used to specify whether or not subdirectories should -# be searched for input files as well. -# The default value is: NO. - -RECURSIVE = YES - -# The EXCLUDE tag can be used to specify files and/or directories that should be -# excluded from the INPUT source files. This way you can easily exclude a -# subdirectory from a directory tree whose root is specified with the INPUT tag. -# -# Note that relative paths are relative to the directory from which doxygen is -# run. - -EXCLUDE = - -# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or -# directories that are symbolic links (a Unix file system feature) are excluded -# from the input. -# The default value is: NO. - -EXCLUDE_SYMLINKS = NO - -# If the value of the INPUT tag contains directories, you can use the -# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude -# certain files from those directories. -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories for example use the pattern */test/* - -EXCLUDE_PATTERNS = - -# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names -# (namespaces, classes, functions, etc.) that should be excluded from the -# output. The symbol name can be a fully qualified name, a word, or if the -# wildcard * is used, a substring. Examples: ANamespace, AClass, -# AClass::ANamespace, ANamespace::*Test -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories use the pattern */test/* - -EXCLUDE_SYMBOLS = c10::* caffe2::* cereal* DL* TH* cudnn* - -# The EXAMPLE_PATH tag can be used to specify one or more files or directories -# that contain example code fragments that are included (see the \include -# command). - -EXAMPLE_PATH = - -# If the value of the EXAMPLE_PATH tag contains directories, you can use the -# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and -# *.h) to filter out the source-files in the directories. If left blank all -# files are included. - -EXAMPLE_PATTERNS = * - -# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be -# searched for input files to be used with the \include or \dontinclude commands -# irrespective of the value of the RECURSIVE tag. -# The default value is: NO. - -EXAMPLE_RECURSIVE = NO - -# The IMAGE_PATH tag can be used to specify one or more files or directories -# that contain images that are to be included in the documentation (see the -# \image command). - -IMAGE_PATH = - -# The INPUT_FILTER tag can be used to specify a program that doxygen should -# invoke to filter for each input file. Doxygen will invoke the filter program -# by executing (via popen()) the command: -# -# -# -# where is the value of the INPUT_FILTER tag, and is the -# name of an input file. Doxygen will then use the output that the filter -# program writes to standard output. If FILTER_PATTERNS is specified, this tag -# will be ignored. 
-# -# Note that the filter must not add or remove lines; it is applied before the -# code is scanned, but not when the output code is generated. If lines are added -# or removed, the anchors will not be placed correctly. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# properly processed by doxygen. - -INPUT_FILTER = - -# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern -# basis. Doxygen will compare the file name with each pattern and apply the -# filter if there is a match. The filters are a list of the form: pattern=filter -# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how -# filters are used. If the FILTER_PATTERNS tag is empty or if none of the -# patterns match the file name, INPUT_FILTER is applied. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# properly processed by doxygen. - -FILTER_PATTERNS = - -# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using -# INPUT_FILTER) will also be used to filter the input files that are used for -# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). -# The default value is: NO. - -FILTER_SOURCE_FILES = NO - -# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file -# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and -# it is also possible to disable source filtering for a specific pattern using -# *.ext= (so without naming a filter). -# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. - -FILTER_SOURCE_PATTERNS = - -# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that -# is part of the input, its contents will be placed on the main page -# (index.html). This can be useful if you have a project on for instance GitHub -# and want to reuse the introduction page also for the doxygen output. - -USE_MDFILE_AS_MAINPAGE = - -#--------------------------------------------------------------------------- -# Configuration options related to source browsing -#--------------------------------------------------------------------------- - -# If the SOURCE_BROWSER tag is set to YES then a list of source files will be -# generated. Documented entities will be cross-referenced with these sources. -# -# Note: To get rid of all source code in the generated output, make sure that -# also VERBATIM_HEADERS is set to NO. -# The default value is: NO. - -SOURCE_BROWSER = NO - -# Setting the INLINE_SOURCES tag to YES will include the body of functions, -# classes and enums directly into the documentation. -# The default value is: NO. - -INLINE_SOURCES = NO - -# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any -# special comment blocks from generated source code fragments. Normal C, C++ and -# Fortran comments will always remain visible. -# The default value is: YES. - -STRIP_CODE_COMMENTS = YES - -# If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# function all documented functions referencing it will be listed. -# The default value is: NO. - -REFERENCED_BY_RELATION = NO - -# If the REFERENCES_RELATION tag is set to YES then for each documented function -# all documented entities called/used by that function will be listed. -# The default value is: NO. 
- -REFERENCES_RELATION = NO - -# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set -# to YES then the hyperlinks from functions in REFERENCES_RELATION and -# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will -# link to the documentation. -# The default value is: YES. - -REFERENCES_LINK_SOURCE = YES - -# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the -# source code will show a tooltip with additional information such as prototype, -# brief description and links to the definition and documentation. Since this -# will make the HTML file larger and loading of large files a bit slower, you -# can opt to disable this feature. -# The default value is: YES. -# This tag requires that the tag SOURCE_BROWSER is set to YES. - -SOURCE_TOOLTIPS = YES - -# If the USE_HTAGS tag is set to YES then the references to source code will -# point to the HTML generated by the htags(1) tool instead of doxygen built-in -# source browser. The htags tool is part of GNU's global source tagging system -# (see https://www.gnu.org/software/global/global.html). You will need version -# 4.8.6 or higher. -# -# To use it do the following: -# - Install the latest version of global -# - Enable SOURCE_BROWSER and USE_HTAGS in the config file -# - Make sure the INPUT points to the root of the source tree -# - Run doxygen as normal -# -# Doxygen will invoke htags (and that will in turn invoke gtags), so these -# tools must be available from the command line (i.e. in the search path). -# -# The result: instead of the source browser generated by doxygen, the links to -# source code will now point to the output of htags. -# The default value is: NO. -# This tag requires that the tag SOURCE_BROWSER is set to YES. - -USE_HTAGS = NO - -# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a -# verbatim copy of the header file for each class for which an include is -# specified. Set to NO to disable this. -# See also: Section \class. -# The default value is: YES. - -VERBATIM_HEADERS = YES - -#--------------------------------------------------------------------------- -# Configuration options related to the alphabetical class index -#--------------------------------------------------------------------------- - -# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all -# compounds will be generated. Enable this if the project contains a lot of -# classes, structs, unions or interfaces. -# The default value is: YES. - -ALPHABETICAL_INDEX = YES - -# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in -# which the alphabetical index list will be split. -# Minimum value: 1, maximum value: 20, default value: 5. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. - -COLS_IN_ALPHA_INDEX = 5 - -# In case all classes in a project start with a common prefix, all classes will -# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag -# can be used to specify a prefix (or a list of prefixes) that should be ignored -# while generating the index headers. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. - -IGNORE_PREFIX = - -#--------------------------------------------------------------------------- -# Configuration options related to the HTML output -#--------------------------------------------------------------------------- - -# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output -# The default value is: YES. 
- -GENERATE_HTML = NO - -# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a -# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of -# it. -# The default directory is: html. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_OUTPUT = html - -# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each -# generated HTML page (for example: .htm, .php, .asp). -# The default value is: .html. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FILE_EXTENSION = .html - -# The HTML_HEADER tag can be used to specify a user-defined HTML header file for -# each generated HTML page. If the tag is left blank doxygen will generate a -# standard header. -# -# To get valid HTML the header file that includes any scripts and style sheets -# that doxygen needs, which is dependent on the configuration options used (e.g. -# the setting GENERATE_TREEVIEW). It is highly recommended to start with a -# default header using -# doxygen -w html new_header.html new_footer.html new_stylesheet.css -# YourConfigFile -# and then modify the file new_header.html. See also section "Doxygen usage" -# for information on how to generate the default header that doxygen normally -# uses. -# Note: The header is subject to change so you typically have to regenerate the -# default header when upgrading to a newer version of doxygen. For a description -# of the possible markers and block names see the documentation. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_HEADER = - -# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each -# generated HTML page. If the tag is left blank doxygen will generate a standard -# footer. See HTML_HEADER for more information on how to generate a default -# footer and what special commands can be used inside the footer. See also -# section "Doxygen usage" for information on how to generate the default footer -# that doxygen normally uses. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FOOTER = - -# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style -# sheet that is used by each HTML page. It can be used to fine-tune the look of -# the HTML output. If left blank doxygen will generate a default style sheet. -# See also section "Doxygen usage" for information on how to generate the style -# sheet that doxygen normally uses. -# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as -# it is more robust and this tag (HTML_STYLESHEET) will in the future become -# obsolete. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_STYLESHEET = - -# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined -# cascading style sheets that are included after the standard style sheets -# created by doxygen. Using this option one can overrule certain style aspects. -# This is preferred over using HTML_STYLESHEET since it does not replace the -# standard style sheet and is therefore more robust against future updates. -# Doxygen will copy the style sheet files to the output directory. -# Note: The order of the extra style sheet files is of importance (e.g. the last -# style sheet in the list overrules the setting of the previous ones in the -# list). For an example see the documentation. -# This tag requires that the tag GENERATE_HTML is set to YES. 
- -HTML_EXTRA_STYLESHEET = - -# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or -# other source files which should be copied to the HTML output directory. Note -# that these files will be copied to the base HTML output directory. Use the -# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these -# files. In the HTML_STYLESHEET file, use the file name only. Also note that the -# files will be copied as-is; there are no commands or markers available. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_EXTRA_FILES = - -# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen -# will adjust the colors in the style sheet and background images according to -# this color. Hue is specified as an angle on a colorwheel, see -# https://en.wikipedia.org/wiki/Hue for more information. For instance the value -# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 -# purple, and 360 is red again. -# Minimum value: 0, maximum value: 359, default value: 220. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_HUE = 220 - -# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors -# in the HTML output. For a value of 0 the output will use grayscales only. A -# value of 255 will produce the most vivid colors. -# Minimum value: 0, maximum value: 255, default value: 100. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_SAT = 100 - -# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the -# luminance component of the colors in the HTML output. Values below 100 -# gradually make the output lighter, whereas values above 100 make the output -# darker. The value divided by 100 is the actual gamma applied, so 80 represents -# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not -# change the gamma. -# Minimum value: 40, maximum value: 240, default value: 80. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_GAMMA = 80 - -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting this -# to YES can help to show when doxygen was last run and thus if the -# documentation is up to date. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_TIMESTAMP = NO - -# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML -# documentation will contain sections that can be hidden and shown after the -# page has loaded. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_DYNAMIC_SECTIONS = NO - -# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries -# shown in the various tree structured indices initially; the user can expand -# and collapse entries dynamically later on. Doxygen will expand the tree to -# such a level that at most the specified number of entries are visible (unless -# a fully collapsed tree already exceeds this amount). So setting the number of -# entries 1 will produce a full collapsed tree by default. 0 is a special value -# representing an infinite number of entries and will result in a full expanded -# tree by default. -# Minimum value: 0, maximum value: 9999, default value: 100. -# This tag requires that the tag GENERATE_HTML is set to YES. 
- -HTML_INDEX_NUM_ENTRIES = 100 - -# If the GENERATE_DOCSET tag is set to YES, additional index files will be -# generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: https://developer.apple.com/tools/xcode/), introduced with -# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a -# Makefile in the HTML output directory. Running make will produce the docset in -# that directory and running make install will install the docset in -# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -# startup. See https://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_DOCSET = NO - -# This tag determines the name of the docset feed. A documentation feed provides -# an umbrella under which multiple documentation sets from a single provider -# (such as a company or product suite) can be grouped. -# The default value is: Doxygen generated docs. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_FEEDNAME = "Doxygen generated docs" - -# This tag specifies a string that should uniquely identify the documentation -# set bundle. This should be a reverse domain-name style string, e.g. -# com.mycompany.MyDocSet. Doxygen will append .docset to the name. -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_BUNDLE_ID = org.doxygen.Project - -# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify -# the documentation publisher. This should be a reverse domain-name style -# string, e.g. com.mycompany.MyDocSet.documentation. -# The default value is: org.doxygen.Publisher. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_PUBLISHER_ID = org.doxygen.Publisher - -# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. -# The default value is: Publisher. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_PUBLISHER_NAME = Publisher - -# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three -# additional HTML index files: index.hhp, index.hhc, and index.hhk. The -# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on -# Windows. -# -# The HTML Help Workshop contains a compiler that can convert all HTML output -# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML -# files are now used as the Windows 98 help format, and will replace the old -# Windows help format (.hlp) on all Windows platforms in the future. Compressed -# HTML files also contain an index, a table of contents, and you can search for -# words in the documentation. The HTML workshop also contains a viewer for -# compressed HTML files. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_HTMLHELP = NO - -# The CHM_FILE tag can be used to specify the file name of the resulting .chm -# file. You can add a path in front of the file if the result should not be -# written to the html output directory. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -CHM_FILE = - -# The HHC_LOCATION tag can be used to specify the location (absolute path -# including file name) of the HTML help compiler (hhc.exe). 
If non-empty, -# doxygen will try to run the HTML help compiler on the generated index.hhp. -# The file has to be specified with full path. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -HHC_LOCATION = - -# The GENERATE_CHI flag controls if a separate .chi index file is generated -# (YES) or that it should be included in the master .chm file (NO). -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -GENERATE_CHI = NO - -# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) -# and project file content. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -CHM_INDEX_ENCODING = - -# The BINARY_TOC flag controls whether a binary table of contents is generated -# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it -# enables the Previous and Next buttons. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -BINARY_TOC = NO - -# The TOC_EXPAND flag can be set to YES to add extra items for group members to -# the table of contents of the HTML help documentation and to the tree view. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -TOC_EXPAND = NO - -# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and -# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that -# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help -# (.qch) of the generated HTML documentation. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_QHP = NO - -# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify -# the file name of the resulting .qch file. The path specified is relative to -# the HTML output folder. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QCH_FILE = - -# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help -# Project output. For more information please see Qt Help Project / Namespace -# (see: http://doc.qt.io/qt-4.8/qthelpproject.html#namespace). -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_NAMESPACE = org.doxygen.Project - -# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt -# Help Project output. For more information please see Qt Help Project / Virtual -# Folders (see: http://doc.qt.io/qt-4.8/qthelpproject.html#virtual-folders). -# The default value is: doc. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_VIRTUAL_FOLDER = doc - -# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom -# filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_CUST_FILTER_NAME = - -# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the -# custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_CUST_FILTER_ATTRS = - -# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this -# project's filter section matches. Qt Help Project / Filter Attributes (see: -# http://doc.qt.io/qt-4.8/qthelpproject.html#filter-attributes). 
-# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_SECT_FILTER_ATTRS = - -# The QHG_LOCATION tag can be used to specify the location of Qt's -# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the -# generated .qhp file. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHG_LOCATION = - -# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be -# generated, together with the HTML files, they form an Eclipse help plugin. To -# install this plugin and make it available under the help contents menu in -# Eclipse, the contents of the directory containing the HTML and XML files needs -# to be copied into the plugins directory of eclipse. The name of the directory -# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. -# After copying Eclipse needs to be restarted before the help appears. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_ECLIPSEHELP = NO - -# A unique identifier for the Eclipse help plugin. When installing the plugin -# the directory name containing the HTML and XML files should also have this -# name. Each documentation set should have its own identifier. -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. - -ECLIPSE_DOC_ID = org.doxygen.Project - -# If you want full control over the layout of the generated HTML pages it might -# be necessary to disable the index and replace it with your own. The -# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top -# of each HTML page. A value of NO enables the index and the value YES disables -# it. Since the tabs in the index contain the same information as the navigation -# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -DISABLE_INDEX = NO - -# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index -# structure should be generated to display hierarchical information. If the tag -# value is set to YES, a side panel will be generated containing a tree-like -# index structure (just like the one that is generated for HTML Help). For this -# to work a browser that supports JavaScript, DHTML, CSS and frames is required -# (i.e. any modern browser). Windows users are probably better off using the -# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can -# further fine-tune the look of the index. As an example, the default style -# sheet generated by doxygen has an example that shows how to put an image at -# the root of the tree instead of the PROJECT_NAME. Since the tree basically has -# the same information as the tab index, you could consider setting -# DISABLE_INDEX to YES when enabling this option. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_TREEVIEW = NO - -# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that -# doxygen will group on one line in the generated HTML documentation. -# -# Note that a value of 0 will completely suppress the enum values from appearing -# in the overview section. -# Minimum value: 0, maximum value: 20, default value: 4. -# This tag requires that the tag GENERATE_HTML is set to YES. 
- -ENUM_VALUES_PER_LINE = 4 - -# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used -# to set the initial width (in pixels) of the frame in which the tree is shown. -# Minimum value: 0, maximum value: 1500, default value: 250. -# This tag requires that the tag GENERATE_HTML is set to YES. - -TREEVIEW_WIDTH = 250 - -# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to -# external symbols imported via tag files in a separate window. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -EXT_LINKS_IN_WINDOW = NO - -# Use this tag to change the font size of LaTeX formulas included as images in -# the HTML documentation. When you change the font size after a successful -# doxygen run you need to manually remove any form_*.png images from the HTML -# output directory to force them to be regenerated. -# Minimum value: 8, maximum value: 50, default value: 10. -# This tag requires that the tag GENERATE_HTML is set to YES. - -FORMULA_FONTSIZE = 10 - -# Use the FORMULA_TRANSPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are not -# supported properly for IE 6.0, but are supported on all modern browsers. -# -# Note that when changing this option you need to delete any form_*.png files in -# the HTML output directory before the changes have effect. -# The default value is: YES. -# This tag requires that the tag GENERATE_HTML is set to YES. - -FORMULA_TRANSPARENT = YES - -# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see -# https://www.mathjax.org) which uses client side Javascript for the rendering -# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX -# installed or if you want to formulas look prettier in the HTML output. When -# enabled you may also need to install MathJax separately and configure the path -# to it using the MATHJAX_RELPATH option. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -USE_MATHJAX = NO - -# When MathJax is enabled you can set the default output format to be used for -# the MathJax output. See the MathJax site (see: -# http://docs.mathjax.org/en/latest/output.html) for more details. -# Possible values are: HTML-CSS (which is slower, but has the best -# compatibility), NativeMML (i.e. MathML) and SVG. -# The default value is: HTML-CSS. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_FORMAT = HTML-CSS - -# When MathJax is enabled you need to specify the location relative to the HTML -# output directory using the MATHJAX_RELPATH option. The destination directory -# should contain the MathJax.js script. For instance, if the mathjax directory -# is located at the same level as the HTML output directory, then -# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax -# Content Delivery Network so you can quickly see the result without installing -# MathJax. However, it is strongly recommended to install a local copy of -# MathJax from https://www.mathjax.org before deployment. -# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/ - -# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax -# extension names that should be enabled during MathJax rendering. 
For example -# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_EXTENSIONS = - -# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces -# of code that will be used on startup of the MathJax code. See the MathJax site -# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an -# example see the documentation. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_CODEFILE = - -# When the SEARCHENGINE tag is enabled doxygen will generate a search box for -# the HTML output. The underlying search engine uses javascript and DHTML and -# should work on any modern browser. Note that when using HTML help -# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) -# there is already a search function so this one should typically be disabled. -# For large projects the javascript based search engine can be slow, then -# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to -# search using the keyboard; to jump to the search box use <access key> + S -# (what the <access key> is depends on the OS and browser, but it is typically -# <CTRL>, <ALT>/